import warnings
warnings.filterwarnings('ignore')
pd.set_option('float_format', '{:f}'.format)
pd.set_option('display.max_columns', None)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from scipy.stats import norm
import pandas_profiling
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
np.random.seed(sum(map(ord, "aesthetics")))
from sklearn import metrics
import seaborn as sns
# Read the Iris CSV into a DataFrame and preview its first rows.
# NOTE: the location below is an absolute, machine-specific path.
iris_csv_path = '/Users/rajmati.marlecha/Desktop/DMGAssignment/iris-species/Iris.csv'
iris_dataset = pd.read_csv(iris_csv_path)
iris_dataset.head()
| Id | SepalLengthCm | SepalWidthCm | PetalLengthCm | PetalWidthCm | Species | |
|---|---|---|---|---|---|---|
| 0 | 1 | 5.1 | 3.5 | 1.4 | 0.2 | Iris-setosa |
| 1 | 2 | 4.9 | 3.0 | 1.4 | 0.2 | Iris-setosa |
| 2 | 3 | 4.7 | 3.2 | 1.3 | 0.2 | Iris-setosa |
| 3 | 4 | 4.6 | 3.1 | 1.5 | 0.2 | Iris-setosa |
| 4 | 5 | 5.0 | 3.6 | 1.4 | 0.2 | Iris-setosa |
# 1. Histograms of the four measurements, ignoring the Species label.
#    One row of four panels; each panel is titled with its column name.
measurement_columns = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
f, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(20, 4))
for panel, column_name in zip((ax1, ax2, ax3, ax4), measurement_columns):
    panel.set_title(column_name)
    sns.distplot(iris_dataset[column_name], ax=panel)
plt.show()
# 1. Histograms split by Species: for every measurement, draw one grid with a
#    column of panels per species. `g` ends up bound to the last grid drawn.
for column_name in ("SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"):
    g = sns.FacetGrid(iris_dataset, col="Species")
    g = g.map(plt.hist, column_name)
* Iris-setosa : 0.1 to 0.6
* Iris-versicolor : 1.0 to 1.8
* Iris-virginica : 1.4 to 2.5
PetalLengthCm is not the candidate, as there is some overlap between the values for Iris-versicolor and Iris-virginica:
* Iris-setosa : 1.0 to 1.9
* Iris-versicolor : 3.0 to 5.1
* Iris-virginica : 4.5 to 6.9
#Extra distribution
#pandas_profiling.ProfileReport(iris_dataset)
#Extra
Dataset info
| Number of variables | 6 |
|---|---|
| Number of observations | 150 |
| Total Missing (%) | 0.0% |
| Total size in memory | 7.1 KiB |
| Average record size in memory | 48.5 B |
Variables types
| Numeric | 4 |
|---|---|
| Categorical | 1 |
| Boolean | 0 |
| Date | 0 |
| Text (Unique) | 0 |
| Rejected | 1 |
| Unsupported | 0 |
Warnings
PetalWidthCm is highly correlated with PetalLengthCm (ρ = 0.96276) RejectedId
Numeric
| Distinct count | 150 |
|---|---|
| Unique (%) | 100.0% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 75.5 |
|---|---|
| Minimum | 1 |
| Maximum | 150 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 1 |
|---|---|
| 5-th percentile | 8.45 |
| Q1 | 38.25 |
| Median | 75.5 |
| Q3 | 112.75 |
| 95-th percentile | 142.55 |
| Maximum | 150 |
| Range | 149 |
| Interquartile range | 74.5 |
Descriptive statistics
| Standard deviation | 43.445 |
|---|---|
| Coef of variation | 0.57544 |
| Kurtosis | -1.2 |
| Mean | 75.5 |
| MAD | 37.5 |
| Skewness | 0 |
| Sum | 11325 |
| Variance | 1887.5 |
| Memory size | 1.2 KiB |
| Value | Count | Frequency (%) | |
| 150 | 1 | 0.7% |
|
| 56 | 1 | 0.7% |
|
| 54 | 1 | 0.7% |
|
| 53 | 1 | 0.7% |
|
| 52 | 1 | 0.7% |
|
| 51 | 1 | 0.7% |
|
| 50 | 1 | 0.7% |
|
| 49 | 1 | 0.7% |
|
| 48 | 1 | 0.7% |
|
| 47 | 1 | 0.7% |
|
| Other values (140) | 140 | 93.3% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 1 | 1 | 0.7% |
|
| 2 | 1 | 0.7% |
|
| 3 | 1 | 0.7% |
|
| 4 | 1 | 0.7% |
|
| 5 | 1 | 0.7% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 146 | 1 | 0.7% |
|
| 147 | 1 | 0.7% |
|
| 148 | 1 | 0.7% |
|
| 149 | 1 | 0.7% |
|
| 150 | 1 | 0.7% |
|
PetalLengthCm
Numeric
| Distinct count | 43 |
|---|---|
| Unique (%) | 28.7% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 3.7587 |
|---|---|
| Minimum | 1 |
| Maximum | 6.9 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 1 |
|---|---|
| 5-th percentile | 1.3 |
| Q1 | 1.6 |
| Median | 4.35 |
| Q3 | 5.1 |
| 95-th percentile | 6.1 |
| Maximum | 6.9 |
| Range | 5.9 |
| Interquartile range | 3.5 |
Descriptive statistics
| Standard deviation | 1.7644 |
|---|---|
| Coef of variation | 0.46943 |
| Kurtosis | -1.4019 |
| Mean | 3.7587 |
| MAD | 1.5619 |
| Skewness | -0.27446 |
| Sum | 563.8 |
| Variance | 3.1132 |
| Memory size | 1.2 KiB |
| Value | Count | Frequency (%) | |
| 1.5 | 14 | 9.3% |
|
| 1.4 | 12 | 8.0% |
|
| 5.1 | 8 | 5.3% |
|
| 4.5 | 8 | 5.3% |
|
| 1.3 | 7 | 4.7% |
|
| 1.6 | 7 | 4.7% |
|
| 5.6 | 6 | 4.0% |
|
| 4.0 | 5 | 3.3% |
|
| 4.9 | 5 | 3.3% |
|
| 4.7 | 5 | 3.3% |
|
| Other values (33) | 73 | 48.7% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 1.0 | 1 | 0.7% |
|
| 1.1 | 1 | 0.7% |
|
| 1.2 | 2 | 1.3% |
|
| 1.3 | 7 | 4.7% |
|
| 1.4 | 12 | 8.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 6.3 | 1 | 0.7% |
|
| 6.4 | 1 | 0.7% |
|
| 6.6 | 1 | 0.7% |
|
| 6.7 | 2 | 1.3% |
|
| 6.9 | 1 | 0.7% |
|
PetalWidthCm
Highly correlated
This variable is highly correlated with PetalLengthCm and should be ignored for analysis
| Correlation | 0.96276 |
|---|
SepalLengthCm
Numeric
| Distinct count | 35 |
|---|---|
| Unique (%) | 23.3% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 5.8433 |
|---|---|
| Minimum | 4.3 |
| Maximum | 7.9 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 4.3 |
|---|---|
| 5-th percentile | 4.6 |
| Q1 | 5.1 |
| Median | 5.8 |
| Q3 | 6.4 |
| 95-th percentile | 7.255 |
| Maximum | 7.9 |
| Range | 3.6 |
| Interquartile range | 1.3 |
Descriptive statistics
| Standard deviation | 0.82807 |
|---|---|
| Coef of variation | 0.14171 |
| Kurtosis | -0.55206 |
| Mean | 5.8433 |
| MAD | 0.68756 |
| Skewness | 0.31491 |
| Sum | 876.5 |
| Variance | 0.68569 |
| Memory size | 1.2 KiB |
| Value | Count | Frequency (%) | |
| 5.0 | 10 | 6.7% |
|
| 6.3 | 9 | 6.0% |
|
| 5.1 | 9 | 6.0% |
|
| 6.7 | 8 | 5.3% |
|
| 5.7 | 8 | 5.3% |
|
| 5.5 | 7 | 4.7% |
|
| 5.8 | 7 | 4.7% |
|
| 6.4 | 7 | 4.7% |
|
| 6.0 | 6 | 4.0% |
|
| 4.9 | 6 | 4.0% |
|
| Other values (25) | 73 | 48.7% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 4.3 | 1 | 0.7% |
|
| 4.4 | 3 | 2.0% |
|
| 4.5 | 1 | 0.7% |
|
| 4.6 | 4 | 2.7% |
|
| 4.7 | 2 | 1.3% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 7.3 | 1 | 0.7% |
|
| 7.4 | 1 | 0.7% |
|
| 7.6 | 1 | 0.7% |
|
| 7.7 | 4 | 2.7% |
|
| 7.9 | 1 | 0.7% |
|
SepalWidthCm
Numeric
| Distinct count | 23 |
|---|---|
| Unique (%) | 15.3% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 3.054 |
|---|---|
| Minimum | 2 |
| Maximum | 4.4 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 2 |
|---|---|
| 5-th percentile | 2.345 |
| Q1 | 2.8 |
| Median | 3 |
| Q3 | 3.3 |
| 95-th percentile | 3.8 |
| Maximum | 4.4 |
| Range | 2.4 |
| Interquartile range | 0.5 |
Descriptive statistics
| Standard deviation | 0.43359 |
|---|---|
| Coef of variation | 0.14198 |
| Kurtosis | 0.29078 |
| Mean | 3.054 |
| MAD | 0.33309 |
| Skewness | 0.33405 |
| Sum | 458.1 |
| Variance | 0.188 |
| Memory size | 1.2 KiB |
| Value | Count | Frequency (%) | |
| 3.0 | 26 | 17.3% |
|
| 2.8 | 14 | 9.3% |
|
| 3.2 | 13 | 8.7% |
|
| 3.4 | 12 | 8.0% |
|
| 3.1 | 12 | 8.0% |
|
| 2.9 | 10 | 6.7% |
|
| 2.7 | 9 | 6.0% |
|
| 2.5 | 8 | 5.3% |
|
| 3.5 | 6 | 4.0% |
|
| 3.8 | 6 | 4.0% |
|
| Other values (13) | 34 | 22.7% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 2.0 | 1 | 0.7% |
|
| 2.2 | 3 | 2.0% |
|
| 2.3 | 4 | 2.7% |
|
| 2.4 | 3 | 2.0% |
|
| 2.5 | 8 | 5.3% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 3.9 | 2 | 1.3% |
|
| 4.0 | 1 | 0.7% |
|
| 4.1 | 1 | 0.7% |
|
| 4.2 | 1 | 0.7% |
|
| 4.4 | 1 | 0.7% |
|
Species
Categorical
| Distinct count | 3 |
|---|---|
| Unique (%) | 2.0% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Iris-versicolor |
50
|
|---|---|
| Iris-virginica |
50
|
| Iris-setosa |
50
|
| Value | Count | Frequency (%) | |
| Iris-versicolor | 50 | 33.3% |
|
| Iris-virginica | 50 | 33.3% |
|
| Iris-setosa | 50 | 33.3% |
|
| Id | SepalLengthCm | SepalWidthCm | PetalLengthCm | PetalWidthCm | Species | |
|---|---|---|---|---|---|---|
| 0 | 1 | 5.1 | 3.5 | 1.4 | 0.2 | Iris-setosa |
| 1 | 2 | 4.9 | 3.0 | 1.4 | 0.2 | Iris-setosa |
| 2 | 3 | 4.7 | 3.2 | 1.3 | 0.2 | Iris-setosa |
| 3 | 4 | 4.6 | 3.1 | 1.5 | 0.2 | Iris-setosa |
| 4 | 5 | 5.0 | 3.6 | 1.4 | 0.2 | Iris-setosa |
# 2. Read the Higgs boson training CSV into a DataFrame and preview its first rows.
# NOTE: the location below is an absolute, machine-specific path.
boson_train_csv_path = '/Users/rajmati.marlecha/Desktop/DMGAssignment/higgs-boson/training.csv'
boson_dataset = pd.read_csv(boson_train_csv_path)
boson_dataset.head()
| EventId | DER_mass_MMC | DER_mass_transverse_met_lep | DER_mass_vis | DER_pt_h | DER_deltaeta_jet_jet | DER_mass_jet_jet | DER_prodeta_jet_jet | DER_deltar_tau_lep | DER_pt_tot | ... | PRI_jet_num | PRI_jet_leading_pt | PRI_jet_leading_eta | PRI_jet_leading_phi | PRI_jet_subleading_pt | PRI_jet_subleading_eta | PRI_jet_subleading_phi | PRI_jet_all_pt | Weight | Label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100000 | 138.470 | 51.655 | 97.827 | 27.980 | 0.91 | 124.711 | 2.666 | 3.064 | 41.928 | ... | 2 | 67.435 | 2.150 | 0.444 | 46.062 | 1.24 | -2.475 | 113.497 | 0.002653 | s |
| 1 | 100001 | 160.937 | 68.768 | 103.235 | 48.146 | -999.00 | -999.000 | -999.000 | 3.473 | 2.078 | ... | 1 | 46.226 | 0.725 | 1.158 | -999.000 | -999.00 | -999.000 | 46.226 | 2.233584 | b |
| 2 | 100002 | -999.000 | 162.172 | 125.953 | 35.635 | -999.00 | -999.000 | -999.000 | 3.148 | 9.336 | ... | 1 | 44.251 | 2.053 | -2.028 | -999.000 | -999.00 | -999.000 | 44.251 | 2.347389 | b |
| 3 | 100003 | 143.905 | 81.417 | 80.943 | 0.414 | -999.00 | -999.000 | -999.000 | 3.310 | 0.414 | ... | 0 | -999.000 | -999.000 | -999.000 | -999.000 | -999.00 | -999.000 | -0.000 | 5.446378 | b |
| 4 | 100004 | 175.864 | 16.915 | 134.805 | 16.405 | -999.00 | -999.000 | -999.000 | 3.891 | 16.405 | ... | 0 | -999.000 | -999.000 | -999.000 | -999.000 | -999.00 | -999.000 | 0.000 | 6.245333 | b |
5 rows × 33 columns
# Read the Higgs boson test CSV into a DataFrame and preview its first rows.
# NOTE: the location below is an absolute, machine-specific path.
boson_test_csv_path = '/Users/rajmati.marlecha/Desktop/DMGAssignment/higgs-boson/test.csv'
boson_dataset_test = pd.read_csv(boson_test_csv_path)
boson_dataset_test.head()
| EventId | DER_mass_MMC | DER_mass_transverse_met_lep | DER_mass_vis | DER_pt_h | DER_deltaeta_jet_jet | DER_mass_jet_jet | DER_prodeta_jet_jet | DER_deltar_tau_lep | DER_pt_tot | ... | PRI_met_phi | PRI_met_sumet | PRI_jet_num | PRI_jet_leading_pt | PRI_jet_leading_eta | PRI_jet_leading_phi | PRI_jet_subleading_pt | PRI_jet_subleading_eta | PRI_jet_subleading_phi | PRI_jet_all_pt | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 350000 | -999.000 | 79.589 | 23.916 | 3.036 | -999.000 | -999.000 | -999.000 | 0.903 | 3.036 | ... | 2.022 | 98.556 | 0 | -999.000 | -999.000 | -999.000 | -999.000 | -999.000 | -999.000 | -0.000 |
| 1 | 350001 | 106.398 | 67.490 | 87.949 | 49.994 | -999.000 | -999.000 | -999.000 | 2.048 | 2.679 | ... | -1.138 | 176.251 | 1 | 47.575 | -0.553 | -0.849 | -999.000 | -999.000 | -999.000 | 47.575 |
| 2 | 350002 | 117.794 | 56.226 | 96.358 | 4.137 | -999.000 | -999.000 | -999.000 | 2.755 | 4.137 | ... | -1.868 | 111.505 | 0 | -999.000 | -999.000 | -999.000 | -999.000 | -999.000 | -999.000 | 0.000 |
| 3 | 350003 | 135.861 | 30.604 | 97.288 | 9.104 | -999.000 | -999.000 | -999.000 | 2.811 | 9.104 | ... | 1.172 | 164.707 | 0 | -999.000 | -999.000 | -999.000 | -999.000 | -999.000 | -999.000 | 0.000 |
| 4 | 350004 | 74.159 | 82.772 | 58.731 | 89.646 | 1.347 | 536.663 | -0.339 | 1.028 | 77.213 | ... | -0.231 | 869.614 | 3 | 254.085 | -1.013 | -0.334 | 185.857 | 0.335 | 2.587 | 599.213 |
5 rows × 31 columns
# Number of columns in the training DataFrame.
len(boson_dataset.columns)
33
# Distinct values found in the Label column.
boson_dataset['Label'].unique()
array(['s', 'b'], dtype=object)
# Column labels of the training data excluding 'EventId' and 'Label'.
# Index.difference already yields the wanted labels, so there is no need to
# build a sub-DataFrame copy just to read its .columns back.
print(boson_dataset.columns.difference(['EventId', 'Label']))
Index(['DER_deltaeta_jet_jet', 'DER_deltar_tau_lep', 'DER_lep_eta_centrality',
'DER_mass_MMC', 'DER_mass_jet_jet', 'DER_mass_transverse_met_lep',
'DER_mass_vis', 'DER_met_phi_centrality', 'DER_prodeta_jet_jet',
'DER_pt_h', 'DER_pt_ratio_lep_tau', 'DER_pt_tot', 'DER_sum_pt',
'PRI_jet_all_pt', 'PRI_jet_leading_eta', 'PRI_jet_leading_phi',
'PRI_jet_leading_pt', 'PRI_jet_num', 'PRI_jet_subleading_eta',
'PRI_jet_subleading_phi', 'PRI_jet_subleading_pt', 'PRI_lep_eta',
'PRI_lep_phi', 'PRI_lep_pt', 'PRI_met', 'PRI_met_phi', 'PRI_met_sumet',
'PRI_tau_eta', 'PRI_tau_phi', 'PRI_tau_pt', 'Weight'],
dtype='object')
#pandas_profiling.ProfileReport(boson_dataset)
Dataset info
| Number of variables | 37 |
|---|---|
| Number of observations | 250000 |
| Total Missing (%) | 0.0% |
| Total size in memory | 70.6 MiB |
| Average record size in memory | 296.0 B |
Variables types
| Numeric | 23 |
|---|---|
| Categorical | 1 |
| Boolean | 0 |
| Date | 0 |
| Text (Unique) | 0 |
| Rejected | 13 |
| Unsupported | 0 |
Warnings
DER_lep_eta_centrality is highly correlated with DER_prodeta_jet_jet (ρ = 0.99999) RejectedDER_mass_jet_jet is highly correlated with DER_deltaeta_jet_jet (ρ = 0.94604) RejectedDER_mass_vis_log is highly correlated with DER_mass_vis (ρ = 0.90108) RejectedDER_prodeta_jet_jet is highly correlated with DER_mass_jet_jet (ρ = 0.94444) RejectedDER_pt_ratio_lep_tau_log is highly correlated with DER_pt_ratio_lep_tau (ρ = 0.90512) RejectedPRI_jet_all_pt is highly correlated with DER_sum_pt (ρ = 0.96563) RejectedPRI_jet_leading_eta is highly correlated with PRI_jet_leading_pt (ρ = 0.9961) RejectedPRI_jet_leading_phi is highly correlated with PRI_jet_leading_eta (ρ = 0.99999) RejectedPRI_jet_num has 99913 / 40.0% zeros ZerosPRI_jet_subleading_eta is highly correlated with PRI_jet_subleading_pt (ρ = 0.99935) RejectedPRI_jet_subleading_phi is highly correlated with PRI_jet_subleading_eta (ρ = 0.99999) RejectedPRI_jet_subleading_pt is highly correlated with DER_lep_eta_centrality (ρ = 0.99935) RejectedPRI_met_sumet is highly correlated with DER_sum_pt (ρ = 0.90448) RejectedPRI_met_sumet_log is highly correlated with PRI_met_sumet (ρ = 0.92121) RejectedDER_deltaeta_jet_jet
Numeric
| Distinct count | 7087 |
|---|---|
| Unique (%) | 2.8% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | -708.42 |
|---|---|
| Minimum | -999 |
| Maximum | 8.503 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | -999 |
|---|---|
| 5-th percentile | -999 |
| Q1 | -999 |
| Median | -999 |
| Q3 | 0.49 |
| 95-th percentile | 4.276 |
| Maximum | 8.503 |
| Range | 1007.5 |
| Interquartile range | 999.49 |
Descriptive statistics
| Standard deviation | 454.48 |
|---|---|
| Coef of variation | -0.64154 |
| Kurtosis | -1.1449 |
| Mean | -708.42 |
| MAD | 412.52 |
| Skewness | 0.92469 |
| Sum | -177110000 |
| Variance | 206550 |
| Memory size | 1.9 MiB |
| Value | Count | Frequency (%) | |
| -999.0 | 177457 | 71.0% |
|
| 0.326 | 33 | 0.0% |
|
| 0.43200000000000005 | 32 | 0.0% |
|
| 0.5329999999999999 | 32 | 0.0% |
|
| 0.574 | 32 | 0.0% |
|
| 1.2930000000000001 | 32 | 0.0% |
|
| 0.254 | 32 | 0.0% |
|
| 0.792 | 32 | 0.0% |
|
| 0.408 | 32 | 0.0% |
|
| 0.087 | 31 | 0.0% |
|
| Other values (7077) | 72255 | 28.9% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| -999.0 | 177457 | 71.0% |
|
| 0.0 | 6 | 0.0% |
|
| 0.001 | 20 | 0.0% |
|
| 0.002 | 28 | 0.0% |
|
| 0.003 | 23 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 8.287 | 1 | 0.0% |
|
| 8.301 | 1 | 0.0% |
|
| 8.326 | 1 | 0.0% |
|
| 8.459 | 1 | 0.0% |
|
| 8.503 | 1 | 0.0% |
|
DER_deltar_tau_lep
Numeric
| Distinct count | 4692 |
|---|---|
| Unique (%) | 1.9% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 2.3731 |
|---|---|
| Minimum | 0.208 |
| Maximum | 5.684 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 0.208 |
|---|---|
| 5-th percentile | 0.973 |
| Q1 | 1.81 |
| Median | 2.4915 |
| Q3 | 2.961 |
| 95-th percentile | 3.441 |
| Maximum | 5.684 |
| Range | 5.476 |
| Interquartile range | 1.151 |
Descriptive statistics
| Standard deviation | 0.78291 |
|---|---|
| Coef of variation | 0.32991 |
| Kurtosis | -0.22245 |
| Mean | 2.3731 |
| MAD | 0.64234 |
| Skewness | -0.21578 |
| Sum | 593270 |
| Variance | 0.61295 |
| Memory size | 1.9 MiB |
| Value | Count | Frequency (%) | |
| 3.094 | 202 | 0.1% |
|
| 3.1310000000000002 | 201 | 0.1% |
|
| 3.0780000000000003 | 199 | 0.1% |
|
| 3.117 | 197 | 0.1% |
|
| 2.904 | 194 | 0.1% |
|
| 3.0980000000000003 | 193 | 0.1% |
|
| 3.133 | 193 | 0.1% |
|
| 3.12 | 193 | 0.1% |
|
| 3.1069999999999998 | 191 | 0.1% |
|
| 3.1 | 190 | 0.1% |
|
| Other values (4682) | 248047 | 99.2% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0.20800000000000002 | 1 | 0.0% |
|
| 0.22399999999999998 | 1 | 0.0% |
|
| 0.228 | 1 | 0.0% |
|
| 0.24600000000000002 | 1 | 0.0% |
|
| 0.256 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 5.624 | 1 | 0.0% |
|
| 5.626 | 1 | 0.0% |
|
| 5.642 | 1 | 0.0% |
|
| 5.655 | 1 | 0.0% |
|
| 5.684 | 1 | 0.0% |
|
DER_lep_eta_centrality
Highly correlated
This variable is highly correlated with DER_prodeta_jet_jet and should be ignored for analysis
| Correlation | 0.99999 |
|---|
DER_mass_MMC
Numeric
| Distinct count | 108338 |
|---|---|
| Unique (%) | 43.3% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | -49.023 |
|---|---|
| Minimum | -999 |
| Maximum | 1192 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | -999 |
|---|---|
| 5-th percentile | -999 |
| Q1 | 78.101 |
| Median | 105.01 |
| Q3 | 130.61 |
| 95-th percentile | 201.81 |
| Maximum | 1192 |
| Range | 2191 |
| Interquartile range | 52.505 |
Descriptive statistics
| Standard deviation | 406.35 |
|---|---|
| Coef of variation | -8.2889 |
| Kurtosis | 1.6242 |
| Mean | -49.023 |
| MAD | 289.66 |
| Skewness | -1.8547 |
| Sum | -12256000 |
| Variance | 165120 |
| Memory size | 1.9 MiB |
| Value | Count | Frequency (%) | |
| -999.0 | 38114 | 15.2% |
|
| 121.26100000000001 | 10 | 0.0% |
|
| 113.965 | 10 | 0.0% |
|
| 125.46600000000001 | 10 | 0.0% |
|
| 108.914 | 10 | 0.0% |
|
| 132.292 | 10 | 0.0% |
|
| 108.95700000000001 | 10 | 0.0% |
|
| 103.762 | 10 | 0.0% |
|
| 96.819 | 10 | 0.0% |
|
| 111.12299999999999 | 10 | 0.0% |
|
| Other values (108328) | 211796 | 84.7% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| -999.0 | 38114 | 15.2% |
|
| 9.044 | 1 | 0.0% |
|
| 9.222000000000001 | 1 | 0.0% |
|
| 9.652999999999999 | 1 | 0.0% |
|
| 9.806000000000001 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 980.192 | 1 | 0.0% |
|
| 985.102 | 1 | 0.0% |
|
| 987.561 | 1 | 0.0% |
|
| 988.199 | 1 | 0.0% |
|
| 1192.026 | 1 | 0.0% |
|
DER_mass_jet_jet
Highly correlated
This variable is highly correlated with DER_deltaeta_jet_jet and should be ignored for analysis
| Correlation | 0.94604 |
|---|
DER_mass_transverse_met_lep
Numeric
| Distinct count | 101637 |
|---|---|
| Unique (%) | 40.7% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 49.24 |
|---|---|
| Minimum | 0 |
| Maximum | 690.08 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 3.389 |
| Q1 | 19.241 |
| Median | 46.524 |
| Q3 | 73.598 |
| 95-th percentile | 104.64 |
| Maximum | 690.08 |
| Range | 690.08 |
| Interquartile range | 54.357 |
Descriptive statistics
| Standard deviation | 35.345 |
|---|---|
| Coef of variation | 0.71781 |
| Kurtosis | 6.3668 |
| Mean | 49.24 |
| MAD | 28.607 |
| Skewness | 1.2192 |
| Sum | 12310000 |
| Variance | 1249.3 |
| Memory size | 1.9 MiB |
| Value | Count | Frequency (%) | |
| 1.8 | 13 | 0.0% |
|
| 9.362 | 12 | 0.0% |
|
| 2.5 | 12 | 0.0% |
|
| 3.1830000000000003 | 12 | 0.0% |
|
| 3.5010000000000003 | 12 | 0.0% |
|
| 0.113 | 11 | 0.0% |
|
| 11.530999999999999 | 11 | 0.0% |
|
| 8.967 | 11 | 0.0% |
|
| 6.335 | 11 | 0.0% |
|
| 11.089 | 11 | 0.0% |
|
| Other values (101627) | 249884 | 100.0% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0.0 | 3 | 0.0% |
|
| 0.001 | 1 | 0.0% |
|
| 0.002 | 4 | 0.0% |
|
| 0.003 | 4 | 0.0% |
|
| 0.004 | 2 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 570.115 | 1 | 0.0% |
|
| 571.868 | 1 | 0.0% |
|
| 594.2869999999999 | 1 | 0.0% |
|
| 595.819 | 1 | 0.0% |
|
| 690.075 | 1 | 0.0% |
|
DER_mass_vis
Numeric
| Distinct count | 100558 |
|---|---|
| Unique (%) | 40.2% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 81.182 |
|---|---|
| Minimum | 6.329 |
| Maximum | 1349.4 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 6.329 |
|---|---|
| 5-th percentile | 37.874 |
| Q1 | 59.389 |
| Median | 73.752 |
| Q3 | 92.259 |
| 95-th percentile | 149.27 |
| Maximum | 1349.4 |
| Range | 1343 |
| Interquartile range | 32.87 |
Descriptive statistics
| Standard deviation | 40.829 |
|---|---|
| Coef of variation | 0.50293 |
| Kurtosis | 35.494 |
| Mean | 81.182 |
| MAD | 25.454 |
| Skewness | 3.7903 |
| Sum | 20295000 |
| Variance | 1667 |
| Memory size | 1.9 MiB |
| Value | Count | Frequency (%) | |
| 76.819 | 16 | 0.0% |
|
| 61.286 | 15 | 0.0% |
|
| 70.41199999999999 | 14 | 0.0% |
|
| 63.648 | 13 | 0.0% |
|
| 68.039 | 13 | 0.0% |
|
| 71.097 | 13 | 0.0% |
|
| 59.87 | 13 | 0.0% |
|
| 68.752 | 13 | 0.0% |
|
| 62.044 | 13 | 0.0% |
|
| 79.36 | 13 | 0.0% |
|
| Other values (100548) | 249864 | 99.9% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 6.329 | 1 | 0.0% |
|
| 6.462000000000001 | 1 | 0.0% |
|
| 7.12 | 1 | 0.0% |
|
| 7.202000000000001 | 1 | 0.0% |
|
| 7.33 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 1034.205 | 1 | 0.0% |
|
| 1051.358 | 1 | 0.0% |
|
| 1153.1660000000002 | 1 | 0.0% |
|
| 1329.9129999999998 | 1 | 0.0% |
|
| 1349.351 | 1 | 0.0% |
|
DER_mass_vis_log
Highly correlated
This variable is highly correlated with DER_mass_vis and should be ignored for analysis
| Correlation | 0.90108 |
|---|
DER_met_phi_centrality
Numeric
| Distinct count | 2829 |
|---|---|
| Unique (%) | 1.1% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | -0.1283 |
|---|---|
| Minimum | -1.414 |
| Maximum | 1.414 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | -1.414 |
|---|---|
| 5-th percentile | -1.413 |
| Q1 | -1.371 |
| Median | -0.356 |
| Q3 | 1.225 |
| 95-th percentile | 1.412 |
| Maximum | 1.414 |
| Range | 2.828 |
| Interquartile range | 2.596 |
Descriptive statistics
| Standard deviation | 1.1936 |
|---|---|
| Coef of variation | -9.3027 |
| Kurtosis | -1.7681 |
| Mean | -0.1283 |
| MAD | 1.1308 |
| Skewness | 0.15114 |
| Sum | -32076 |
| Variance | 1.4246 |
| Memory size | 1.9 MiB |
| Value | Count | Frequency (%) | |
| -1.4140000000000001 | 11429 | 4.6% |
|
| 1.4140000000000001 | 7778 | 3.1% |
|
| -1.413 | 5227 | 2.1% |
|
| -1.412 | 3514 | 1.4% |
|
| 1.413 | 3345 | 1.3% |
|
| -1.411 | 2820 | 1.1% |
|
| -1.41 | 2341 | 0.9% |
|
| 1.412 | 2205 | 0.9% |
|
| -1.409 | 2178 | 0.9% |
|
| -1.4080000000000001 | 1929 | 0.8% |
|
| Other values (2819) | 207234 | 82.9% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| -1.4140000000000001 | 11429 | 4.6% |
|
| -1.413 | 5227 | 2.1% |
|
| -1.412 | 3514 | 1.4% |
|
| -1.411 | 2820 | 1.1% |
|
| -1.41 | 2341 | 0.9% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 1.41 | 1450 | 0.6% |
|
| 1.411 | 1724 | 0.7% |
|
| 1.412 | 2205 | 0.9% |
|
| 1.413 | 3345 | 1.3% |
|
| 1.4140000000000001 | 7778 | 3.1% |
|
DER_prodeta_jet_jet
Highly correlated
This variable is highly correlated with DER_mass_jet_jet and should be ignored for analysis
| Correlation | 0.94444 |
|---|
DER_pt_h
Numeric
| Distinct count | 115563 |
|---|---|
| Unique (%) | 46.2% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 57.896 |
|---|---|
| Minimum | 0 |
| Maximum | 2835 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 1.201 |
| Q1 | 14.069 |
| Median | 38.468 |
| Q3 | 79.169 |
| 95-th percentile | 183.49 |
| Maximum | 2835 |
| Range | 2835 |
| Interquartile range | 65.1 |
Descriptive statistics
| Standard deviation | 63.656 |
|---|---|
| Coef of variation | 1.0995 |
| Kurtosis | 22.028 |
| Mean | 57.896 |
| MAD | 45.406 |
| Skewness | 2.5419 |
| Sum | 14474000 |
| Variance | 4052 |
| Memory size | 1.9 MiB |
| Value | Count | Frequency (%) | |
| 0.0 | 41 | 0.0% |
|
| 1.308 | 25 | 0.0% |
|
| 0.778 | 25 | 0.0% |
|
| 1.8430000000000002 | 25 | 0.0% |
|
| 0.6970000000000001 | 25 | 0.0% |
|
| 0.763 | 24 | 0.0% |
|
| 1.319 | 23 | 0.0% |
|
| 1.094 | 23 | 0.0% |
|
| 1.4340000000000002 | 23 | 0.0% |
|
| 0.654 | 22 | 0.0% |
|
| Other values (115553) | 249744 | 99.9% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0.0 | 41 | 0.0% |
|
| 0.005 | 1 | 0.0% |
|
| 0.011000000000000001 | 1 | 0.0% |
|
| 0.012 | 2 | 0.0% |
|
| 0.013999999999999999 | 2 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 734.2769999999999 | 1 | 0.0% |
|
| 753.745 | 1 | 0.0% |
|
| 762.806 | 1 | 0.0% |
|
| 1053.807 | 1 | 0.0% |
|
| 2834.9990000000003 | 1 | 0.0% |
|
DER_pt_ratio_lep_tau
Numeric
| Distinct count | 5931 |
|---|---|
| Unique (%) | 2.4% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 1.4376 |
|---|---|
| Minimum | 0.047 |
| Maximum | 19.773 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 0.047 |
|---|---|
| 5-th percentile | 0.488 |
| Q1 | 0.883 |
| Median | 1.28 |
| Q3 | 1.777 |
| 95-th percentile | 2.897 |
| Maximum | 19.773 |
| Range | 19.726 |
| Interquartile range | 0.894 |
Descriptive statistics
| Standard deviation | 0.84474 |
|---|---|
| Coef of variation | 0.5876 |
| Kurtosis | 18.297 |
| Mean | 1.4376 |
| MAD | 0.59245 |
| Skewness | 2.6335 |
| Sum | 359400 |
| Variance | 0.71359 |
| Memory size | 1.9 MiB |
| Value | Count | Frequency (%) | |
| 0.9009999999999999 | 206 | 0.1% |
|
| 1.128 | 198 | 0.1% |
|
| 1.232 | 197 | 0.1% |
|
| 0.9540000000000001 | 196 | 0.1% |
|
| 1.249 | 195 | 0.1% |
|
| 1.2830000000000001 | 193 | 0.1% |
|
| 1.149 | 187 | 0.1% |
|
| 1.2670000000000001 | 185 | 0.1% |
|
| 1.198 | 184 | 0.1% |
|
| 1.155 | 184 | 0.1% |
|
| Other values (5921) | 248075 | 99.2% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0.047 | 1 | 0.0% |
|
| 0.07400000000000001 | 1 | 0.0% |
|
| 0.077 | 1 | 0.0% |
|
| 0.08 | 2 | 0.0% |
|
| 0.081 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 16.776 | 1 | 0.0% |
|
| 18.872 | 1 | 0.0% |
|
| 18.992 | 1 | 0.0% |
|
| 19.672 | 1 | 0.0% |
|
| 19.773 | 1 | 0.0% |
|
DER_pt_ratio_lep_tau_log
Highly correlated
This variable is highly correlated with DER_pt_ratio_lep_tau and should be ignored for analysis
| Correlation | 0.90512 |
|---|
DER_pt_tot
Numeric
| Distinct count | 59042 |
|---|---|
| Unique (%) | 23.6% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 18.917 |
|---|---|
| Minimum | 0 |
| Maximum | 2835 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0.733 |
| Q1 | 2.841 |
| Median | 12.316 |
| Q3 | 27.591 |
| 95-th percentile | 56.689 |
| Maximum | 2835 |
| Range | 2835 |
| Interquartile range | 24.75 |
Descriptive statistics
| Standard deviation | 22.273 |
|---|---|
| Coef of variation | 1.1774 |
| Kurtosis | 1036.5 |
| Mean | 18.917 |
| MAD | 15.649 |
| Skewness | 10.579 |
| Sum | 4729300 |
| Variance | 496.11 |
| Memory size | 1.9 MiB |
| Value | Count | Frequency (%) | |
| 1.072 | 44 | 0.0% |
|
| 0.9640000000000001 | 43 | 0.0% |
|
| 1.2819999999999998 | 43 | 0.0% |
|
| 0.851 | 41 | 0.0% |
|
| 1.308 | 41 | 0.0% |
|
| 1.5030000000000001 | 40 | 0.0% |
|
| 1.693 | 40 | 0.0% |
|
| 0.892 | 40 | 0.0% |
|
| 1.26 | 40 | 0.0% |
|
| 1.3619999999999999 | 39 | 0.0% |
|
| Other values (59032) | 249589 | 99.8% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0.0 | 39 | 0.0% |
|
| 0.001 | 2 | 0.0% |
|
| 0.003 | 1 | 0.0% |
|
| 0.004 | 2 | 0.0% |
|
| 0.005 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 372.721 | 1 | 0.0% |
|
| 403.195 | 1 | 0.0% |
|
| 466.525 | 1 | 0.0% |
|
| 513.659 | 1 | 0.0% |
|
| 2834.9990000000003 | 1 | 0.0% |
|
DER_sum_pt
Numeric
| Distinct count | 156098 |
|---|---|
| Unique (%) | 62.4% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 158.43 |
|---|---|
| Minimum | 46.104 |
| Maximum | 1852.5 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 46.104 |
|---|---|
| 5-th percentile | 55.973 |
| Q1 | 77.55 |
| Median | 120.66 |
| Q3 | 200.48 |
| 95-th percentile | 383 |
| Maximum | 1852.5 |
| Range | 1806.4 |
| Interquartile range | 122.93 |
Descriptive statistics
| Standard deviation | 115.71 |
|---|---|
| Coef of variation | 0.73032 |
| Kurtosis | 8.8372 |
| Mean | 158.43 |
| MAD | 83.451 |
| Skewness | 2.3206 |
| Sum | 39608000 |
| Variance | 13388 |
| Memory size | 1.9 MiB |
| Value | Count | Frequency (%) | |
| 76.854 | 13 | 0.0% |
|
| 68.48100000000001 | 12 | 0.0% |
|
| 64.523 | 11 | 0.0% |
|
| 66.18 | 10 | 0.0% |
|
| 69.8 | 10 | 0.0% |
|
| 64.673 | 10 | 0.0% |
|
| 77.59 | 9 | 0.0% |
|
| 69.54 | 9 | 0.0% |
|
| 63.293 | 9 | 0.0% |
|
| 75.976 | 9 | 0.0% |
|
| Other values (156088) | 249898 | 100.0% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 46.104 | 1 | 0.0% |
|
| 46.211999999999996 | 1 | 0.0% |
|
| 46.227 | 1 | 0.0% |
|
| 46.229 | 1 | 0.0% |
|
| 46.25 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 1558.993 | 1 | 0.0% |
|
| 1675.4489999999998 | 1 | 0.0% |
|
| 1687.0870000000002 | 1 | 0.0% |
|
| 1703.7520000000002 | 1 | 0.0% |
|
| 1852.4620000000002 | 1 | 0.0% |
|
EventId
Numeric
| Distinct count | 250000 |
|---|---|
| Unique (%) | 100.0% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 225000 |
|---|---|
| Minimum | 100000 |
| Maximum | 349999 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 100000 |
|---|---|
| 5-th percentile | 112500 |
| Q1 | 162500 |
| Median | 225000 |
| Q3 | 287500 |
| 95-th percentile | 337500 |
| Maximum | 349999 |
| Range | 249999 |
| Interquartile range | 125000 |
Descriptive statistics
| Standard deviation | 72169 |
|---|---|
| Coef of variation | 0.32075 |
| Kurtosis | -1.2 |
| Mean | 225000 |
| MAD | 62500 |
| Skewness | 0 |
| Sum | 56249875000 |
| Variance | 5208400000 |
| Memory size | 1.9 MiB |
| Value | Count | Frequency (%) | |
| 100303 | 1 | 0.0% |
|
| 142096 | 1 | 0.0% |
|
| 187150 | 1 | 0.0% |
|
| 181005 | 1 | 0.0% |
|
| 183052 | 1 | 0.0% |
|
| 193291 | 1 | 0.0% |
|
| 195338 | 1 | 0.0% |
|
| 189193 | 1 | 0.0% |
|
| 191240 | 1 | 0.0% |
|
| 168711 | 1 | 0.0% |
|
| Other values (249990) | 249990 | 100.0% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 100000 | 1 | 0.0% |
|
| 100001 | 1 | 0.0% |
|
| 100002 | 1 | 0.0% |
|
| 100003 | 1 | 0.0% |
|
| 100004 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 349995 | 1 | 0.0% |
|
| 349996 | 1 | 0.0% |
|
| 349997 | 1 | 0.0% |
|
| 349998 | 1 | 0.0% |
|
| 349999 | 1 | 0.0% |
|
Label
Categorical
| Distinct count | 2 |
|---|---|
| Unique (%) | 0.0% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| b |
164333
|
|---|---|
| s |
85667
|
| Value | Count | Frequency (%) | |
| b | 164333 | 65.7% |
|
| s | 85667 | 34.3% |
|
PRI_jet_all_pt
Highly correlated
This variable is highly correlated with DER_sum_pt and should be ignored for analysis
| Correlation | 0.96563 |
|---|
PRI_jet_leading_eta
Highly correlated
This variable is highly correlated with PRI_jet_leading_pt and should be ignored for analysis
| Correlation | 0.9961 |
|---|
PRI_jet_leading_phi
Highly correlated
This variable is highly correlated with PRI_jet_leading_eta and should be ignored for analysis
| Correlation | 0.99999 |
|---|
PRI_jet_leading_pt
Numeric
| Distinct count | 86590 |
|---|---|
| Unique (%) | 34.6% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | -348.33 |
|---|---|
| Minimum | -999 |
| Maximum | 1120.6 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | -999 |
|---|---|
| 5-th percentile | -999 |
| Q1 | -999 |
| Median | 38.96 |
| Q3 | 75.349 |
| 95-th percentile | 169.46 |
| Maximum | 1120.6 |
| Range | 2119.6 |
| Interquartile range | 1074.3 |
Descriptive statistics
| Standard deviation | 532.96 |
|---|---|
| Coef of variation | -1.5301 |
| Kurtosis | -1.8107 |
| Mean | -348.33 |
| MAD | 520.08 |
| Skewness | -0.38373 |
| Sum | -87082000 |
| Variance | 284050 |
| Memory size | 1.9 MiB |
| Value | Count | Frequency (%) | |
| -999.0 | 99913 | 40.0% |
|
| 40.089 | 10 | 0.0% |
|
| 36.493 | 10 | 0.0% |
|
| 30.763 | 10 | 0.0% |
|
| 36.358000000000004 | 10 | 0.0% |
|
| 30.363000000000003 | 10 | 0.0% |
|
| 31.739 | 10 | 0.0% |
|
| 34.184 | 10 | 0.0% |
|
| 35.656 | 9 | 0.0% |
|
| 34.275 | 9 | 0.0% |
|
| Other values (86580) | 149999 | 60.0% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| -999.0 | 99913 | 40.0% |
|
| 30.0 | 3 | 0.0% |
|
| 30.000999999999998 | 2 | 0.0% |
|
| 30.002 | 6 | 0.0% |
|
| 30.003 | 5 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 738.235 | 1 | 0.0% |
|
| 743.222 | 1 | 0.0% |
|
| 755.235 | 1 | 0.0% |
|
| 760.846 | 1 | 0.0% |
|
| 1120.573 | 1 | 0.0% |
|
PRI_jet_num
Numeric
| Distinct count | 4 |
|---|---|
| Unique (%) | 0.0% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 0.97918 |
|---|---|
| Minimum | 0 |
| Maximum | 3 |
| Zeros (%) | 40.0% |
Quantile statistics
| Minimum | 0 |
|---|---|
| 5-th percentile | 0 |
| Q1 | 0 |
| Median | 1 |
| Q3 | 2 |
| 95-th percentile | 3 |
| Maximum | 3 |
| Range | 3 |
| Interquartile range | 2 |
Descriptive statistics
| Standard deviation | 0.97743 |
|---|---|
| Coef of variation | 0.99821 |
| Kurtosis | -0.7378 |
| Mean | 0.97918 |
| MAD | 0.78266 |
| Skewness | 0.61128 |
| Sum | 244794 |
| Variance | 0.95536 |
| Memory size | 1.9 MiB |
| Value | Count | Frequency (%) | |
| 0 | 99913 | 40.0% |
|
| 1 | 77544 | 31.0% |
|
| 2 | 50379 | 20.2% |
|
| 3 | 22164 | 8.9% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0 | 99913 | 40.0% |
|
| 1 | 77544 | 31.0% |
|
| 2 | 50379 | 20.2% |
|
| 3 | 22164 | 8.9% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 0 | 99913 | 40.0% |
|
| 1 | 77544 | 31.0% |
|
| 2 | 50379 | 20.2% |
|
| 3 | 22164 | 8.9% |
|
PRI_jet_subleading_eta
Highly correlated
This variable is highly correlated with PRI_jet_subleading_pt and should be ignored for analysis
| Correlation | 0.99935 |
|---|
PRI_jet_subleading_phi
Highly correlated
This variable is highly correlated with PRI_jet_subleading_eta and should be ignored for analysis
| Correlation | 0.99999 |
|---|
PRI_jet_subleading_pt
Highly correlated
This variable is highly correlated with DER_lep_eta_centrality and should be ignored for analysis
| Correlation | 0.99935 |
|---|
PRI_lep_eta
Numeric
| Distinct count | 4987 |
|---|---|
| Unique (%) | 2.0% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | -0.019507 |
|---|---|
| Minimum | -2.505 |
| Maximum | 2.503 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | -2.505 |
|---|---|
| 5-th percentile | -2.072 |
| Q1 | -1.014 |
| Median | -0.045 |
| Q3 | 0.959 |
| 95-th percentile | 2.066 |
| Maximum | 2.503 |
| Range | 5.008 |
| Interquartile range | 1.973 |
Descriptive statistics
| Standard deviation | 1.265 |
|---|---|
| Coef of variation | -64.846 |
| Kurtosis | -0.95698 |
| Mean | -0.019507 |
| MAD | 1.0698 |
| Skewness | 0.021623 |
| Sum | -4876.9 |
| Variance | 1.6002 |
| Memory size | 1.9 MiB |
| Value | Count | Frequency (%) | |
| 0.307 | 106 | 0.0% |
|
| -0.20600000000000002 | 105 | 0.0% |
|
| 0.392 | 95 | 0.0% |
|
| 0.364 | 94 | 0.0% |
|
| -0.629 | 94 | 0.0% |
|
| 0.524 | 92 | 0.0% |
|
| -0.158 | 91 | 0.0% |
|
| 0.335 | 90 | 0.0% |
|
| -0.644 | 90 | 0.0% |
|
| 0.34 | 90 | 0.0% |
|
| Other values (4977) | 249053 | 99.6% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| -2.505 | 1 | 0.0% |
|
| -2.494 | 1 | 0.0% |
|
| -2.49 | 3 | 0.0% |
|
| -2.489 | 2 | 0.0% |
|
| -2.487 | 2 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 2.496 | 1 | 0.0% |
|
| 2.497 | 1 | 0.0% |
|
| 2.499 | 1 | 0.0% |
|
| 2.502 | 1 | 0.0% |
|
| 2.503 | 1 | 0.0% |
|
PRI_lep_phi
Numeric
| Distinct count | 6285 |
|---|---|
| Unique (%) | 2.5% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 0.043543 |
|---|---|
| Minimum | -3.142 |
| Maximum | 3.142 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | -3.142 |
|---|---|
| 5-th percentile | -2.834 |
| Q1 | -1.522 |
| Median | 0.086 |
| Q3 | 1.618 |
| 95-th percentile | 2.838 |
| Maximum | 3.142 |
| Range | 6.284 |
| Interquartile range | 3.14 |
Descriptive statistics
| Standard deviation | 1.8166 |
|---|---|
| Coef of variation | 41.72 |
| Kurtosis | -1.1901 |
| Mean | 0.043543 |
| MAD | 1.5694 |
| Skewness | -0.045746 |
| Sum | 10886 |
| Variance | 3.3001 |
| Memory size | 1.9 MiB |
| Value | Count | Frequency (%) | |
| 3.05 | 64 | 0.0% |
|
| 1.9169999999999998 | 64 | 0.0% |
|
| 1.6059999999999999 | 64 | 0.0% |
|
| -0.171 | 63 | 0.0% |
|
| 2.189 | 62 | 0.0% |
|
| 0.207 | 62 | 0.0% |
|
| 0.687 | 62 | 0.0% |
|
| 0.086 | 62 | 0.0% |
|
| 0.948 | 62 | 0.0% |
|
| 1.7619999999999998 | 62 | 0.0% |
|
| Other values (6275) | 249373 | 99.7% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| -3.142 | 9 | 0.0% |
|
| -3.141 | 44 | 0.0% |
|
| -3.14 | 41 | 0.0% |
|
| -3.139 | 40 | 0.0% |
|
| -3.138 | 40 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 3.138 | 34 | 0.0% |
|
| 3.139 | 42 | 0.0% |
|
| 3.14 | 36 | 0.0% |
|
| 3.141 | 40 | 0.0% |
|
| 3.142 | 2 | 0.0% |
|
PRI_lep_pt
Numeric
| Distinct count | 61929 |
|---|---|
| Unique (%) | 24.8% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 46.66 |
|---|---|
| Minimum | 26 |
| Maximum | 560.27 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 26 |
|---|---|
| 5-th percentile | 27.213 |
| Q1 | 32.375 |
| Median | 40.516 |
| Q3 | 53.39 |
| 95-th percentile | 86.647 |
| Maximum | 560.27 |
| Range | 534.27 |
| Interquartile range | 21.015 |
Descriptive statistics
| Standard deviation | 22.065 |
|---|---|
| Coef of variation | 0.47289 |
| Kurtosis | 21.583 |
| Mean | 46.66 |
| MAD | 14.882 |
| Skewness | 3.2408 |
| Sum | 11665000 |
| Variance | 486.86 |
| Memory size | 1.9 MiB |
| Value | Count | Frequency (%) | |
| 30.037 | 22 | 0.0% |
|
| 28.855 | 22 | 0.0% |
|
| 28.58 | 22 | 0.0% |
|
| 28.329 | 22 | 0.0% |
|
| 34.902 | 21 | 0.0% |
|
| 32.806 | 21 | 0.0% |
|
| 30.416999999999998 | 20 | 0.0% |
|
| 30.987 | 20 | 0.0% |
|
| 32.924 | 20 | 0.0% |
|
| 28.815 | 20 | 0.0% |
|
| Other values (61919) | 249790 | 99.9% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 26.0 | 2 | 0.0% |
|
| 26.000999999999998 | 13 | 0.0% |
|
| 26.002 | 15 | 0.0% |
|
| 26.003 | 12 | 0.0% |
|
| 26.004 | 6 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 437.707 | 1 | 0.0% |
|
| 447.87800000000004 | 1 | 0.0% |
|
| 452.434 | 1 | 0.0% |
|
| 461.89599999999996 | 1 | 0.0% |
|
| 560.271 | 1 | 0.0% |
|
PRI_met
Numeric
| Distinct count | 87836 |
|---|---|
| Unique (%) | 35.1% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 41.717 |
|---|---|
| Minimum | 0.109 |
| Maximum | 2842.6 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 0.109 |
|---|---|
| 5-th percentile | 8.4299 |
| Q1 | 21.398 |
| Median | 34.802 |
| Q3 | 51.895 |
| 95-th percentile | 99.891 |
| Maximum | 2842.6 |
| Range | 2842.5 |
| Interquartile range | 30.497 |
Descriptive statistics
| Standard deviation | 32.895 |
|---|---|
| Coef of variation | 0.78852 |
| Kurtosis | 227.34 |
| Mean | 41.717 |
| MAD | 21.786 |
| Skewness | 5.2708 |
| Sum | 10429000 |
| Variance | 1082.1 |
| Memory size | 1.9 MiB |
| Value | Count | Frequency (%) | |
| 31.252 | 15 | 0.0% |
|
| 31.701 | 14 | 0.0% |
|
| 30.498 | 13 | 0.0% |
|
| 29.685 | 13 | 0.0% |
|
| 33.951 | 13 | 0.0% |
|
| 32.442 | 13 | 0.0% |
|
| 25.933000000000003 | 13 | 0.0% |
|
| 26.642 | 13 | 0.0% |
|
| 27.022 | 13 | 0.0% |
|
| 33.143 | 13 | 0.0% |
|
| Other values (87826) | 249867 | 99.9% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0.109 | 1 | 0.0% |
|
| 0.155 | 1 | 0.0% |
|
| 0.162 | 1 | 0.0% |
|
| 0.179 | 1 | 0.0% |
|
| 0.2 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 551.06 | 1 | 0.0% |
|
| 593.237 | 1 | 0.0% |
|
| 695.533 | 1 | 0.0% |
|
| 951.363 | 1 | 0.0% |
|
| 2842.617 | 1 | 0.0% |
|
PRI_met_log
Numeric
| Distinct count | 87836 |
|---|---|
| Unique (%) | 35.1% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 3.4751 |
|---|---|
| Minimum | -2.2164 |
| Maximum | 7.9525 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | -2.2164 |
|---|---|
| 5-th percentile | 2.1318 |
| Q1 | 3.0633 |
| Median | 3.5497 |
| Q3 | 3.9492 |
| 95-th percentile | 4.6041 |
| Maximum | 7.9525 |
| Range | 10.169 |
| Interquartile range | 0.88592 |
Descriptive statistics
| Standard deviation | 0.75452 |
|---|---|
| Coef of variation | 0.21712 |
| Kurtosis | 1.528 |
| Mean | 3.4751 |
| MAD | 0.57295 |
| Skewness | -0.6258 |
| Sum | 868780 |
| Variance | 0.5693 |
| Memory size | 1.9 MiB |
| Value | Count | Frequency (%) | |
| 3.442083374134498 | 15 | 0.0% |
|
| 3.4563482261270035 | 14 | 0.0% |
|
| 3.8526975393433314 | 13 | 0.0% |
|
| 3.4176611076928203 | 13 | 0.0% |
|
| 2.72831024010957 | 13 | 0.0% |
|
| 3.2966513490377674 | 13 | 0.0% |
|
| 3.3906418677226386 | 13 | 0.0% |
|
| 2.8927023732344472 | 13 | 0.0% |
|
| 3.342437686455233 | 13 | 0.0% |
|
| 3.2555162889631926 | 13 | 0.0% |
|
| Other values (87826) | 249867 | 99.9% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| -2.2164073967529934 | 1 | 0.0% |
|
| -1.8643301620628905 | 1 | 0.0% |
|
| -1.820158943749753 | 1 | 0.0% |
|
| -1.720369473141382 | 1 | 0.0% |
|
| -1.6094379124341003 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 6.311843696146471 | 1 | 0.0% |
|
| 6.38559398188572 | 1 | 0.0% |
|
| 6.544678458117106 | 1 | 0.0% |
|
| 6.857895693185185 | 1 | 0.0% |
|
| 7.952480385727471 | 1 | 0.0% |
|
PRI_met_phi
Numeric
| Distinct count | 6285 |
|---|---|
| Unique (%) | 2.5% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | -0.010119 |
|---|---|
| Minimum | -3.142 |
| Maximum | 3.142 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | -3.142 |
|---|---|
| 5-th percentile | -2.831 |
| Q1 | -1.575 |
| Median | -0.024 |
| Q3 | 1.561 |
| 95-th percentile | 2.823 |
| Maximum | 3.142 |
| Range | 6.284 |
| Interquartile range | 3.136 |
Descriptive statistics
| Standard deviation | 1.8122 |
|---|---|
| Coef of variation | -179.09 |
| Kurtosis | -1.1961 |
| Mean | -0.010119 |
| MAD | 1.5686 |
| Skewness | 0.0079078 |
| Sum | -2529.8 |
| Variance | 3.2842 |
| Memory size | 1.9 MiB |
| Value | Count | Frequency (%) | |
| -2.0909999999999997 | 66 | 0.0% |
|
| -2.063 | 65 | 0.0% |
|
| -2.911 | 64 | 0.0% |
|
| 1.6 | 64 | 0.0% |
|
| 0.516 | 63 | 0.0% |
|
| 2.4090000000000003 | 61 | 0.0% |
|
| 0.9309999999999999 | 61 | 0.0% |
|
| 0.263 | 61 | 0.0% |
|
| -3.113 | 61 | 0.0% |
|
| -0.642 | 60 | 0.0% |
|
| Other values (6275) | 249374 | 99.7% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| -3.142 | 3 | 0.0% |
|
| -3.141 | 35 | 0.0% |
|
| -3.14 | 30 | 0.0% |
|
| -3.139 | 43 | 0.0% |
|
| -3.138 | 43 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 3.138 | 47 | 0.0% |
|
| 3.139 | 26 | 0.0% |
|
| 3.14 | 37 | 0.0% |
|
| 3.141 | 41 | 0.0% |
|
| 3.142 | 6 | 0.0% |
|
PRI_met_sumet
Highly correlated
This variable is highly correlated with DER_sum_pt and should be ignored for analysis
| Correlation | 0.90448 |
|---|
PRI_met_sumet_log
Highly correlated
This variable is highly correlated with PRI_met_sumet and should be ignored for analysis
| Correlation | 0.92121 |
|---|
PRI_tau_eta
Numeric
| Distinct count | 4971 |
|---|---|
| Unique (%) | 2.0% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | -0.010973 |
|---|---|
| Minimum | -2.499 |
| Maximum | 2.497 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | -2.499 |
|---|---|
| 5-th percentile | -2.011 |
| Q1 | -0.925 |
| Median | -0.023 |
| Q3 | 0.898 |
| 95-th percentile | 2.011 |
| Maximum | 2.497 |
| Range | 4.996 |
| Interquartile range | 1.823 |
Descriptive statistics
| Standard deviation | 1.2141 |
|---|---|
| Coef of variation | -110.64 |
| Kurtosis | -0.84229 |
| Mean | -0.010973 |
| MAD | 1.0123 |
| Skewness | 0.017852 |
| Sum | -2743.3 |
| Variance | 1.474 |
| Memory size | 1.9 MiB |
| Value | Count | Frequency (%) | |
| 0.152 | 141 | 0.1% |
|
| -0.152 | 131 | 0.1% |
|
| -0.301 | 129 | 0.1% |
|
| -0.899 | 124 | 0.0% |
|
| 0.3 | 122 | 0.0% |
|
| 0.301 | 122 | 0.0% |
|
| -0.898 | 121 | 0.0% |
|
| 0.9009999999999999 | 121 | 0.0% |
|
| -0.9009999999999999 | 121 | 0.0% |
|
| 0.899 | 119 | 0.0% |
|
| Other values (4961) | 248749 | 99.5% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| -2.499 | 1 | 0.0% |
|
| -2.498 | 3 | 0.0% |
|
| -2.497 | 1 | 0.0% |
|
| -2.496 | 2 | 0.0% |
|
| -2.495 | 3 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 2.492 | 4 | 0.0% |
|
| 2.4930000000000003 | 1 | 0.0% |
|
| 2.494 | 2 | 0.0% |
|
| 2.495 | 1 | 0.0% |
|
| 2.497 | 2 | 0.0% |
|
PRI_tau_phi
Numeric
| Distinct count | 6285 |
|---|---|
| Unique (%) | 2.5% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | -0.0081711 |
|---|---|
| Minimum | -3.142 |
| Maximum | 3.142 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | -3.142 |
|---|---|
| 5-th percentile | -2.829 |
| Q1 | -1.575 |
| Median | -0.033 |
| Q3 | 1.565 |
| 95-th percentile | 2.83 |
| Maximum | 3.142 |
| Range | 6.284 |
| Interquartile range | 3.14 |
Descriptive statistics
| Standard deviation | 1.8168 |
|---|---|
| Coef of variation | -222.34 |
| Kurtosis | -1.2006 |
| Mean | -0.0081711 |
| MAD | 1.5736 |
| Skewness | 0.013872 |
| Sum | -2042.8 |
| Variance | 3.3006 |
| Memory size | 1.9 MiB |
| Value | Count | Frequency (%) | |
| -2.988 | 72 | 0.0% |
|
| -1.235 | 69 | 0.0% |
|
| -2.017 | 66 | 0.0% |
|
| -0.542 | 66 | 0.0% |
|
| -0.54 | 65 | 0.0% |
|
| -0.536 | 64 | 0.0% |
|
| -1.0290000000000001 | 64 | 0.0% |
|
| 1.215 | 63 | 0.0% |
|
| -1.247 | 62 | 0.0% |
|
| 2.594 | 62 | 0.0% |
|
| Other values (6275) | 249347 | 99.7% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| -3.142 | 4 | 0.0% |
|
| -3.141 | 33 | 0.0% |
|
| -3.14 | 36 | 0.0% |
|
| -3.139 | 43 | 0.0% |
|
| -3.138 | 32 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 3.138 | 35 | 0.0% |
|
| 3.139 | 21 | 0.0% |
|
| 3.14 | 29 | 0.0% |
|
| 3.141 | 35 | 0.0% |
|
| 3.142 | 5 | 0.0% |
|
PRI_tau_pt
Numeric
| Distinct count | 59639 |
|---|---|
| Unique (%) | 23.9% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 38.707 |
|---|---|
| Minimum | 20 |
| Maximum | 764.41 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 20 |
|---|---|
| 5-th percentile | 20.787 |
| Q1 | 24.592 |
| Median | 31.804 |
| Q3 | 45.017 |
| 95-th percentile | 77.882 |
| Maximum | 764.41 |
| Range | 744.41 |
| Interquartile range | 20.425 |
Descriptive statistics
| Standard deviation | 22.412 |
|---|---|
| Coef of variation | 0.57901 |
| Kurtosis | 30.512 |
| Mean | 38.707 |
| MAD | 14.799 |
| Skewness | 3.7552 |
| Sum | 9676900 |
| Variance | 502.3 |
| Memory size | 1.9 MiB |
| Value | Count | Frequency (%) | |
| 21.134 | 32 | 0.0% |
|
| 20.059 | 30 | 0.0% |
|
| 21.219 | 29 | 0.0% |
|
| 20.048 | 29 | 0.0% |
|
| 22.256999999999998 | 29 | 0.0% |
|
| 20.195 | 29 | 0.0% |
|
| 20.875999999999998 | 28 | 0.0% |
|
| 21.575 | 28 | 0.0% |
|
| 20.660999999999998 | 28 | 0.0% |
|
| 23.138 | 27 | 0.0% |
|
| Other values (59629) | 249711 | 99.9% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 20.0 | 17 | 0.0% |
|
| 20.000999999999998 | 15 | 0.0% |
|
| 20.002 | 15 | 0.0% |
|
| 20.003 | 17 | 0.0% |
|
| 20.004 | 19 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 415.985 | 1 | 0.0% |
|
| 449.648 | 1 | 0.0% |
|
| 505.06 | 1 | 0.0% |
|
| 622.862 | 1 | 0.0% |
|
| 764.408 | 1 | 0.0% |
|
Weight
Numeric
| Distinct count | 104096 |
|---|---|
| Unique (%) | 41.6% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 1.6468 |
|---|---|
| Minimum | 0.0015019 |
| Maximum | 7.8225 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 0.0015019 |
|---|---|
| 5-th percentile | 0.0015027 |
| Q1 | 0.018636 |
| Median | 1.1562 |
| Q3 | 2.4041 |
| 95-th percentile | 5.348 |
| Maximum | 7.8225 |
| Range | 7.821 |
| Interquartile range | 2.3855 |
Descriptive statistics
| Standard deviation | 1.8751 |
|---|---|
| Coef of variation | 1.1387 |
| Kurtosis | -0.1545 |
| Mean | 1.6468 |
| MAD | 1.5264 |
| Skewness | 0.99063 |
| Sum | 411690 |
| Variance | 3.516 |
| Memory size | 1.9 MiB |
| Value | Count | Frequency (%) | |
| 0.0015027048310099999 | 38552 | 15.4% |
|
| 0.018636116672000002 | 32352 | 12.9% |
|
| 1.68161144262 | 17259 | 6.9% |
|
| 0.7440562472300001 | 10351 | 4.1% |
|
| 0.0026533113373299996 | 7789 | 3.1% |
|
| 0.00150187015894 | 6974 | 2.8% |
|
| 1.4548484726800002 | 6818 | 2.7% |
|
| 0.07389912981499999 | 5051 | 2.0% |
|
| 0.0713571365583 | 4657 | 1.9% |
|
| 0.309795155685 | 4013 | 1.6% |
|
| Other values (104086) | 116184 | 46.5% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0.00150187015894 | 6974 | 2.8% |
|
| 0.0015027048310099999 | 38552 | 15.4% |
|
| 0.0026533113373299996 | 7789 | 3.1% |
|
| 0.018636116672000002 | 32352 | 12.9% |
|
| 0.0640607773058 | 2998 | 1.2% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 7.769833369520001 | 1 | 0.0% |
|
| 7.805034958110001 | 1 | 0.0% |
|
| 7.817382808239999 | 1 | 0.0% |
|
| 7.821960707020001 | 1 | 0.0% |
|
| 7.82254254503 | 1 | 0.0% |
|
| EventId | DER_mass_MMC | DER_mass_transverse_met_lep | DER_mass_vis | DER_pt_h | DER_deltaeta_jet_jet | DER_mass_jet_jet | DER_prodeta_jet_jet | DER_deltar_tau_lep | DER_pt_tot | DER_sum_pt | DER_pt_ratio_lep_tau | DER_met_phi_centrality | DER_lep_eta_centrality | PRI_tau_pt | PRI_tau_eta | PRI_tau_phi | PRI_lep_pt | PRI_lep_eta | PRI_lep_phi | PRI_met | PRI_met_phi | PRI_met_sumet | PRI_jet_num | PRI_jet_leading_pt | PRI_jet_leading_eta | PRI_jet_leading_phi | PRI_jet_subleading_pt | PRI_jet_subleading_eta | PRI_jet_subleading_phi | PRI_jet_all_pt | Weight | Label | PRI_met_log | DER_mass_vis_log | DER_pt_ratio_lep_tau_log | PRI_met_sumet_log | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100000 | 138.470000 | 51.655000 | 97.827000 | 27.980000 | 0.910000 | 124.711000 | 2.666000 | 3.064000 | 41.928000 | 197.760000 | 1.582000 | 1.396000 | 0.200000 | 32.638000 | 1.017000 | 0.381000 | 51.626000 | 2.273000 | -2.414000 | 16.824000 | -0.277000 | 258.733000 | 2 | 67.435000 | 2.150000 | 0.444000 | 46.062000 | 1.240000 | -2.475000 | 113.497000 | 0.002653 | s | 2.822806 | 4.583201 | 0.458690 | 5.555797 |
| 1 | 100001 | 160.937000 | 68.768000 | 103.235000 | 48.146000 | -999.000000 | -999.000000 | -999.000000 | 3.473000 | 2.078000 | 125.157000 | 0.879000 | 1.414000 | -999.000000 | 42.014000 | 2.039000 | -3.011000 | 36.918000 | 0.501000 | 0.103000 | 44.704000 | -1.916000 | 164.546000 | 1 | 46.226000 | 0.725000 | 1.158000 | -999.000000 | -999.000000 | -999.000000 | 46.226000 | 2.233584 | b | 3.800063 | 4.637008 | -0.128970 | 5.103190 |
| 2 | 100002 | -999.000000 | 162.172000 | 125.953000 | 35.635000 | -999.000000 | -999.000000 | -999.000000 | 3.148000 | 9.336000 | 197.814000 | 3.776000 | 1.414000 | -999.000000 | 32.154000 | -0.705000 | -2.093000 | 121.409000 | -0.953000 | 1.052000 | 54.283000 | -2.186000 | 260.414000 | 1 | 44.251000 | 2.053000 | -2.028000 | -999.000000 | -999.000000 | -999.000000 | 44.251000 | 2.347389 | b | 3.994211 | 4.835909 | 1.328665 | 5.562273 |
| 3 | 100003 | 143.905000 | 81.417000 | 80.943000 | 0.414000 | -999.000000 | -999.000000 | -999.000000 | 3.310000 | 0.414000 | 75.968000 | 2.354000 | -1.285000 | -999.000000 | 22.647000 | -1.655000 | 0.010000 | 53.321000 | -0.522000 | -3.100000 | 31.082000 | 0.060000 | 86.062000 | 0 | -999.000000 | -999.000000 | -999.000000 | -999.000000 | -999.000000 | -999.000000 | -0.000000 | 5.446378 | b | 3.436629 | 4.393745 | 0.856116 | 4.455068 |
| 4 | 100004 | 175.864000 | 16.915000 | 134.805000 | 16.405000 | -999.000000 | -999.000000 | -999.000000 | 3.891000 | 16.405000 | 57.983000 | 1.056000 | -1.385000 | -999.000000 | 28.209000 | -2.197000 | -2.231000 | 29.774000 | 0.798000 | 1.569000 | 2.723000 | -0.871000 | 53.131000 | 0 | -999.000000 | -999.000000 | -999.000000 | -999.000000 | -999.000000 | -999.000000 | 0.000000 | 6.245333 | b | 1.001734 | 4.903829 | 0.054488 | 3.972761 |
# Histograms of every column of the training frame, without splitting by class.
# EventId, Label and Weight are removed from the column set before plotting.
_hist_columns = boson_dataset.columns.difference(['EventId', 'Label', 'Weight'])
hist = boson_dataset[_hist_columns].hist(figsize=(30, 30))
# the entire dataset including the test
#result_df = pd.DataFrame(boson_dataset.append(boson_dataset_test))
#hist = result_df[result_df.columns.difference(['EventId', 'Label',''])].hist(figsize=(30,30))
# Split the rows by the value of the Label column.
boson_dataset_s = boson_dataset[boson_dataset.Label == 's']
boson_dataset_b = boson_dataset[boson_dataset.Label == 'b']

# NOTE(review): DataFrame.size is rows x columns (total number of cells),
# so the figures printed here are cell counts, not row counts.
for caption, frame in (
    ('Total: ', boson_dataset),
    ('Class S: ', boson_dataset_s),
    ('Class B: ', boson_dataset_b),
):
    print(caption + str(frame.size))
Total: 8250000 Class S: 2827011 Class B: 5422989
# Per-class histogram grids; EventId, Label and Weight are left out of both.
excluded_columns = ['EventId', 'Label', 'Weight']
hist_s = boson_dataset_s[boson_dataset_s.columns.difference(excluded_columns)].hist(figsize=(30, 30))
hist_b = boson_dataset_b[boson_dataset_b.columns.difference(excluded_columns)].hist(figsize=(30, 30))

# One figure per column, overlaying the distribution of the 's' rows and the 'b' rows.
# The two row filters do not depend on the column, so they are evaluated once up front.
rows_s = boson_dataset[boson_dataset.Label == 's']
rows_b = boson_dataset[boson_dataset.Label == 'b']
for i, col in enumerate(boson_dataset.columns.difference(excluded_columns)):
    plt.figure(i)
    sns.distplot(rows_s[col])
    sns.distplot(rows_b[col])
# Preview the first rows of the iris frame (head() defaults to five rows).
iris_dataset.head()
| Id | SepalLengthCm | SepalWidthCm | PetalLengthCm | PetalWidthCm | Species | |
|---|---|---|---|---|---|---|
| 0 | 1 | 5.100000 | 3.500000 | 1.400000 | 0.200000 | Iris-setosa |
| 1 | 2 | 4.900000 | 3.000000 | 1.400000 | 0.200000 | Iris-setosa |
| 2 | 3 | 4.700000 | 3.200000 | 1.300000 | 0.200000 | Iris-setosa |
| 3 | 4 | 4.600000 | 3.100000 | 1.500000 | 0.200000 | Iris-setosa |
| 4 | 5 | 5.000000 | 3.600000 | 1.400000 | 0.200000 | Iris-setosa |
# Scatter plot for each pair of iris measurements, coloured by Species.
# Each pair gets its own FacetGrid (and therefore its own figure and legend).
iris_axis_pairs = [
    ("SepalLengthCm", "SepalWidthCm"),
    ("SepalLengthCm", "PetalWidthCm"),
    ("SepalWidthCm", "PetalLengthCm"),
    ("SepalWidthCm", "PetalWidthCm"),
    ("SepalLengthCm", "PetalLengthCm"),
    ("PetalLengthCm", "PetalWidthCm"),
]
for x_name, y_name in iris_axis_pairs:
    g = sns.FacetGrid(iris_dataset, hue="Species", height=6)
    g = g.map(plt.scatter, x_name, y_name).add_legend()
## This helper works for any column, so it can be called repeatedly
## instead of re-running the plotting code for each variable.
# The set of variables of interest can be limited by the caller.
def Plot(varname, dataset):
    """Plot the distribution of one column before and after a natural log.

    Two panels are drawn side by side: the raw column on the left and
    ``np.log`` of the column on the right. Missing values are replaced by 0
    in both panels, and each panel is drawn with ``fit=norm``.

    Parameters
    ----------
    varname : str
        Name of the column to plot; also appended to both panel titles.
    dataset : DataFrame-like
        Object indexed by ``varname`` to obtain the column.
    """
    figure, (raw_axis, log_axis) = plt.subplots(1, 2, figsize=(20, 4))
    raw_axis.set_title('Before Log for var:' + varname)
    log_axis.set_title('After Log for var:' + varname)

    raw_values = dataset[varname].fillna(0)
    sns.distplot(raw_values, ax=raw_axis, fit=norm)

    # The log is taken first and only the resulting NaNs are replaced by 0.
    logged_values = np.log(dataset[varname]).fillna(0)
    sns.distplot(logged_values, ax=log_axis, fit=norm)

    plt.show()
# (new log column, source column) pairs, in the order the columns are added.
log_column_specs = [
    ('PRI_met_log', 'PRI_met'),
    ('DER_mass_vis_log', 'DER_mass_vis'),
    ('DER_pt_ratio_lep_tau_log', 'DER_pt_ratio_lep_tau'),
    ('PRI_met_sumet_log', 'PRI_met_sumet'),
]

# First show the before/after-log distributions for every source column ...
for _, source_column in log_column_specs:
    Plot(source_column, boson_dataset)

# ... then store the natural log of each source column as a new column.
for log_column, source_column in log_column_specs:
    boson_dataset[log_column] = np.log(boson_dataset[source_column])
# Draw 2000 random rows from each Label group so the scatter plots stay readable.
# (A plain boson_dataset.sample(n=10) was the earlier alternative.)
boson_dataset_sample = boson_dataset.groupby('Label').apply(
    lambda label_rows: label_rows.sample(2000)
)

# Each raw pair is followed by the same pair after the log transform;
# the last two pairs are the additional comparison.
boson_axis_pairs = [
    ("PRI_met_sumet", "DER_pt_ratio_lep_tau"),
    ("PRI_met_sumet_log", "DER_pt_ratio_lep_tau_log"),
    ("PRI_met", "DER_mass_vis"),
    ("PRI_met_log", "DER_mass_vis_log"),
]
for x_name, y_name in boson_axis_pairs:
    g = sns.FacetGrid(boson_dataset_sample, hue="Label", height=6)
    g = g.map(plt.scatter, x_name, y_name).add_legend()
# Load the digit-recognizer training CSV into a DataFrame.
# NOTE(review): the path is absolute and machine-specific, so this only runs
# where that exact file exists.
digit_recognizer = pd.read_csv('/Users/rajmati.marlecha/Desktop/DMGAssignment/digit-recognizer/train.csv')
# Show (number of rows, number of columns) of the loaded frame.
digit_recognizer.shape
(42000, 785)
# Distinct label values in ascending order.
digitArray = np.sort(digit_recognizer.label.unique())
digitArray
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
def twoDPCA(clas):
    """Project the rows of one label class onto two principal components.

    Rows of the module-level ``digit_recognizer`` frame whose ``label``
    equals ``clas`` are selected, every feature column is standardized, and
    a two-component PCA is fitted on the standardized values.

    Parameters
    ----------
    clas
        Label value to keep (compared with ``digit_recognizer.label``).

    Returns
    -------
    numpy.ndarray
        One row per selected sample, two columns (the PCA projection).
    """
    from sklearn.preprocessing import StandardScaler

    # Everything after the first column is used as features; the first
    # column is skipped (the label is read via ``digit_recognizer.label``).
    # The labels themselves are not needed here: the previous version built
    # a ``y`` from the whole, unfiltered frame and never used it.
    X = digit_recognizer[digit_recognizer.label == clas].iloc[:, 1:].values

    # Standardize the features before fitting the PCA.
    X_std = StandardScaler().fit_transform(X)

    # PCA with 2 components, fitted on this class only.
    return PCA(n_components=2).fit_transform(X_std)
def plot2DPCA(digits):
    """Draw side-by-side 2D PCA scatter plots for up to three digit classes.

    A single row of three panels is created. The i-th entry of ``digits`` is
    projected with ``twoDPCA`` and scattered on the i-th panel. Panels
    without a matching digit stay empty, and entries beyond the third are
    ignored.

    Parameters
    ----------
    digits
        Sequence of label values (NumPy array, list, ...). Previously a
        sequence of exactly two entries had its second digit silently
        dropped, and only objects with a ``.size`` attribute were accepted.
    """
    figure, axes = plt.subplots(1, 3, figsize=(20, 4))

    # zip stops at the shorter input, so 1, 2 or 3 digits are all handled
    # the same way, each on its own panel.
    for axis, digit in zip(axes, digits):
        projection = twoDPCA(digit)
        axis.set_title('Scatter for 2D PCA for Digit :' + str(digit))
        # First principal component on x, second on y.
        axis.scatter(projection[:, 0], projection[:, 1])

    plt.show()
# Walk through the labels three at a time and plot each batch in one figure.
# The final slice can hold fewer than three labels.
for batch_start in range(0, digitArray.size, 3):
    batch_end = batch_start + 3
    plot2DPCA(digitArray[batch_start:batch_end])
* Looking at the 2D PCA scatter plots above, we can see the following:
* The digits 1, 4, 7 and 9 show more spread than the other digits
* This is visible in the range of values taken by the PCA components
* We can deduce that the spread is higher for digits that can be written in multiple ways
* People write these digits in different ways
# Feature matrix: columns 1..784 of the digit frame (column 0 is skipped).
X = digit_recognizer.iloc[:, 1:785].values

# Reduce the features to 30 principal components and wrap the result in a
# DataFrame whose columns are numbered 0..29.
pca = PCA(n_components=30)
DIGITS_PCA_30_dataset_array = pca.fit_transform(X)
DIGITS_PCA_30_dataset = pd.DataFrame(DIGITS_PCA_30_dataset_array)
DIGITS_PCA_30_dataset.head()
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -661.595780 | -699.311329 | 183.282042 | 120.611398 | -81.081824 | 489.461847 | -683.470895 | 85.559497 | 348.548292 | 202.980240 | -364.553391 | 21.264995 | 404.477151 | -97.048964 | 61.841956 | -86.806832 | 17.566938 | 285.478027 | 18.489390 | 207.299992 | 44.105701 | 222.279285 | 56.833039 | -150.908486 | 13.615222 | -39.822421 | -329.048320 | 209.887501 | 53.774265 | 85.084384 |
| 1 | 1701.451685 | -360.551556 | -501.805593 | 335.423654 | -442.378931 | 738.404042 | 653.875432 | -176.600386 | -7.520126 | 67.845959 | 34.221877 | 46.551135 | -70.435414 | -342.688616 | 377.844994 | -5.674843 | 317.738249 | 87.597697 | -94.560900 | -175.011465 | -213.072028 | -272.313867 | 6.882494 | -22.581181 | -34.636878 | 264.347635 | -75.546643 | 14.564312 | -83.783591 | -89.918589 |
| 2 | -886.894434 | -293.765783 | 67.155311 | 78.263766 | -473.715929 | -323.540652 | 437.799060 | -305.377773 | -195.295404 | -25.625397 | 367.599003 | 252.106279 | 54.574054 | -58.590929 | 163.088229 | -162.499168 | -100.321591 | -155.356230 | 70.093155 | -182.973527 | -104.998611 | 128.702540 | 152.083811 | 51.397146 | -113.140441 | 89.196840 | -228.414856 | 107.748696 | -10.485223 | 123.854704 |
| 3 | -165.755602 | 300.182762 | -64.145486 | 759.706252 | -425.844359 | 157.390367 | -304.099073 | 276.409517 | -45.867386 | -295.481490 | -2.753945 | -256.887521 | -87.784179 | -175.353144 | 40.048777 | -87.516544 | 54.449475 | 199.835489 | -11.539386 | 298.295407 | -232.006594 | -90.413399 | 293.350033 | 128.015966 | -67.282002 | -195.306691 | -189.473525 | 82.536451 | 221.288171 | 196.478324 |
| 4 | 1923.709716 | -449.153070 | -548.613023 | 188.555150 | -651.736270 | 990.063828 | 564.507103 | -255.915507 | 124.914695 | 177.567891 | -19.491802 | 333.725465 | -213.053366 | -354.650122 | 93.474844 | 26.795321 | 232.924581 | -54.718908 | -45.238239 | -256.050012 | -15.475114 | -131.242623 | -140.946481 | -56.700300 | 175.366804 | -8.765334 | 50.303635 | -164.771254 | -67.505413 | 17.474002 |
# Attach the original label to each projected row.
# NOTE(review): this assignment aligns on the index; it presumably relies on
# both frames still having their default 0..n-1 index — confirm.
DIGITS_PCA_30_dataset['label']=digit_recognizer['label']
# Print (rows, columns) after adding the label column, then preview the frame.
print(DIGITS_PCA_30_dataset.shape)
DIGITS_PCA_30_dataset.head()
(42000, 31)
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -661.595780 | -699.311329 | 183.282042 | 120.611398 | -81.081824 | 489.461847 | -683.470895 | 85.559497 | 348.548292 | 202.980240 | -364.553391 | 21.264995 | 404.477151 | -97.048964 | 61.841956 | -86.806832 | 17.566938 | 285.478027 | 18.489390 | 207.299992 | 44.105701 | 222.279285 | 56.833039 | -150.908486 | 13.615222 | -39.822421 | -329.048320 | 209.887501 | 53.774265 | 85.084384 | 1 |
| 1 | 1701.451685 | -360.551556 | -501.805593 | 335.423654 | -442.378931 | 738.404042 | 653.875432 | -176.600386 | -7.520126 | 67.845959 | 34.221877 | 46.551135 | -70.435414 | -342.688616 | 377.844994 | -5.674843 | 317.738249 | 87.597697 | -94.560900 | -175.011465 | -213.072028 | -272.313867 | 6.882494 | -22.581181 | -34.636878 | 264.347635 | -75.546643 | 14.564312 | -83.783591 | -89.918589 | 0 |
| 2 | -886.894434 | -293.765783 | 67.155311 | 78.263766 | -473.715929 | -323.540652 | 437.799060 | -305.377773 | -195.295404 | -25.625397 | 367.599003 | 252.106279 | 54.574054 | -58.590929 | 163.088229 | -162.499168 | -100.321591 | -155.356230 | 70.093155 | -182.973527 | -104.998611 | 128.702540 | 152.083811 | 51.397146 | -113.140441 | 89.196840 | -228.414856 | 107.748696 | -10.485223 | 123.854704 | 1 |
| 3 | -165.755602 | 300.182762 | -64.145486 | 759.706252 | -425.844359 | 157.390367 | -304.099073 | 276.409517 | -45.867386 | -295.481490 | -2.753945 | -256.887521 | -87.784179 | -175.353144 | 40.048777 | -87.516544 | 54.449475 | 199.835489 | -11.539386 | 298.295407 | -232.006594 | -90.413399 | 293.350033 | 128.015966 | -67.282002 | -195.306691 | -189.473525 | 82.536451 | 221.288171 | 196.478324 | 4 |
| 4 | 1923.709716 | -449.153070 | -548.613023 | 188.555150 | -651.736270 | 990.063828 | 564.507103 | -255.915507 | 124.914695 | 177.567891 | -19.491802 | 333.725465 | -213.053366 | -354.650122 | 93.474844 | 26.795321 | 232.924581 | -54.718908 | -45.238239 | -256.050012 | -15.475114 | -131.242623 | -140.946481 | -56.700300 | 175.366804 | -8.765334 | 50.303635 | -164.771254 | -67.505413 | 17.474002 | 0 |
def _sample_class_centers(digit):
    """Return ten randomly sampled rows of the 30 PCA columns for one label.

    Rows of ``DIGITS_PCA_30_dataset`` whose ``label`` equals ``digit`` are
    selected, only the first 30 columns are kept, ten rows are drawn with
    ``sample`` and the index is reset to 0..9.
    """
    class_rows = DIGITS_PCA_30_dataset[DIGITS_PCA_30_dataset.label == digit]
    pca_features = class_rows.iloc[:, 0:30]
    return pca_features.sample(n=10).reset_index(drop=True)


# One set of ten sampled rows per label, drawn in ascending label order.
cluster_centers_class0 = _sample_class_centers(0)
cluster_centers_class1 = _sample_class_centers(1)
cluster_centers_class2 = _sample_class_centers(2)
cluster_centers_class3 = _sample_class_centers(3)
cluster_centers_class4 = _sample_class_centers(4)
cluster_centers_class5 = _sample_class_centers(5)
cluster_centers_class6 = _sample_class_centers(6)
cluster_centers_class7 = _sample_class_centers(7)
cluster_centers_class8 = _sample_class_centers(8)
cluster_centers_class9 = _sample_class_centers(9)
cluster_centers_class0.head(10)
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 947.714147 | -241.117585 | -276.651909 | -164.271022 | 358.450078 | -623.386981 | 120.255554 | 213.249644 | -151.484938 | -217.194960 | -153.690008 | -746.294288 | 0.392494 | -643.950065 | 324.863905 | 105.177959 | 54.993093 | 63.001149 | -29.029979 | -421.024782 | -46.797382 | -73.372633 | 157.372604 | 34.601213 | -46.919940 | -29.069742 | 58.820646 | 110.170843 | -181.334687 | 96.400874 |
| 1 | 892.724401 | -227.117228 | 366.709841 | -296.076736 | -864.594265 | -447.487888 | -189.315261 | -216.877476 | -273.944311 | 274.110988 | -111.994871 | 47.693260 | -291.542325 | 407.879375 | -16.560997 | -267.320933 | -191.918727 | 94.493796 | 248.629034 | -13.922736 | 249.338930 | 38.731225 | -142.432136 | 177.787839 | 248.477887 | 148.567440 | 211.392941 | 322.176976 | 146.838993 | 64.231047 |
| 2 | 953.747885 | -182.647088 | 405.948511 | 44.189688 | 128.215920 | -850.428660 | 562.533969 | 339.716086 | 352.122094 | -158.456284 | -497.796919 | -391.096326 | 1.887902 | -466.450363 | 299.357853 | -272.948311 | -1.053956 | 73.067006 | -114.459454 | -257.422169 | -62.119460 | 85.286100 | 228.414495 | 255.287583 | 102.761921 | 119.702934 | 147.251590 | -33.353525 | 9.892319 | 191.835267 |
| 3 | 679.167017 | 321.280739 | 237.504405 | 284.034557 | -606.111225 | -169.311497 | 543.699388 | -65.457137 | 340.330863 | 269.964491 | -412.085545 | 161.873325 | 409.234806 | 94.890698 | -337.128670 | -166.504060 | 98.153946 | -123.849749 | -461.199893 | -100.286838 | -215.061878 | -181.424115 | -465.788081 | 169.967503 | 32.481823 | -250.945527 | 41.076895 | 33.263503 | 222.716373 | 105.693883 |
| 4 | 970.423802 | -0.167939 | 489.131272 | -374.440208 | -920.072649 | -257.749796 | 155.345690 | -178.413534 | 164.673589 | 209.768962 | -258.847656 | 68.188904 | -259.185927 | 274.261148 | -435.935692 | -128.907053 | -152.824814 | 305.584327 | 82.645530 | -387.106206 | 25.530307 | -206.204084 | -130.271614 | 100.227668 | 180.664789 | -151.983899 | -54.080037 | 142.534916 | -12.863664 | 206.806269 |
| 5 | 1025.779580 | -247.771119 | 661.749767 | -83.006888 | -1078.052575 | -320.655344 | 14.408930 | -147.246333 | 219.631077 | 290.898648 | -171.020279 | 215.892937 | -97.050007 | 345.343208 | -315.034521 | -4.806259 | 43.360058 | -157.919949 | 80.989053 | -21.338615 | -17.252624 | -373.444898 | 48.900231 | -128.700081 | -13.318912 | -74.441954 | 271.323310 | 236.901966 | 36.175968 | 175.485405 |
| 6 | 1084.948320 | -136.642710 | 212.519483 | -105.504374 | -409.116153 | -478.700337 | -155.251812 | -190.552997 | -503.858509 | -58.071311 | -290.160640 | -262.465003 | 45.206327 | -54.162732 | 585.736764 | -160.480027 | -34.575323 | 127.571411 | -118.288044 | -6.410533 | 16.038249 | 55.347073 | 63.432193 | 41.293891 | 61.928386 | 460.904178 | 82.669067 | 233.391962 | 364.780093 | -128.916335 |
| 7 | 1539.068327 | 146.331724 | 856.573501 | -692.254340 | -200.373436 | -18.671059 | 176.275997 | 624.602079 | 828.229926 | -26.686015 | -252.282070 | 401.026654 | 107.363000 | 117.981422 | -14.773659 | -146.256496 | -390.672392 | -171.117009 | -108.953262 | -123.409165 | 88.071994 | -220.680189 | -7.726764 | -30.723416 | -125.017273 | 49.241258 | 170.635791 | 151.174137 | -218.916265 | -66.585026 |
| 8 | 1312.054961 | -426.413626 | 20.958925 | -346.814091 | -282.754826 | -153.682302 | 169.170536 | -454.741615 | 46.976695 | -223.889405 | -351.634266 | -415.658386 | -349.327530 | -728.627174 | 35.727738 | -54.631543 | 151.928100 | -234.468355 | -65.089419 | -56.776669 | 71.218199 | 183.062468 | 352.340275 | 112.151990 | 208.945094 | 99.667583 | -190.187510 | -204.119492 | 157.731306 | -172.583085 |
| 9 | 853.252388 | -358.873760 | 571.565692 | -206.495907 | -901.560432 | -161.733612 | -234.559537 | -101.872965 | -296.924871 | 316.403210 | -114.290460 | 131.169254 | -155.627317 | 715.033142 | -153.293891 | -201.550763 | 71.807271 | -100.811442 | 212.420161 | 174.422955 | 62.706495 | -98.290904 | 22.548092 | 112.480919 | 328.919519 | -42.581412 | 83.429604 | 435.443834 | 138.654737 | 187.514433 |
# Seed set with exactly one randomly chosen row per class label.
# groupby('label') yields the groups in sorted label order, so after the index
# reset row i is the seed drawn from the i-th label.
one_row_per_class = DIGITS_PCA_30_dataset.groupby('label').apply(
    lambda class_rows: class_rows.sample(1)
)
cluster_centers_allclasses = one_row_per_class.reset_index(drop=True)
# The label column is not a feature, so it must not be part of the centroids.
cluster_centers_allclasses.drop(['label'], axis=1, inplace=True)
cluster_centers_allclasses.head()
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1027.752065 | -244.678145 | 465.468845 | -527.612439 | -814.814163 | -320.928767 | -373.776641 | 5.892921 | -72.928350 | 164.506104 | 0.846011 | -109.744066 | -219.784001 | 182.798499 | 291.380213 | -242.206684 | -257.948816 | -106.107909 | 227.706126 | -28.090817 | 217.595364 | -42.730436 | 159.831140 | 145.012091 | 63.234263 | 184.377339 | 209.268775 | 101.526801 | 305.226537 | -170.874249 |
| 1 | -884.076670 | -439.985587 | 57.927437 | -72.046714 | -341.017626 | -400.798958 | 410.463443 | -259.024020 | 26.948038 | 162.451639 | 475.257808 | 243.017425 | -48.897723 | -134.756211 | 307.622866 | -129.860934 | -94.760297 | -216.370235 | -87.326774 | -43.307335 | -75.861337 | 65.388997 | 151.577595 | 19.010485 | -114.129509 | -115.347839 | -28.683085 | -48.040134 | -2.359464 | 256.514934 |
| 2 | 727.932539 | -76.990688 | 984.075786 | -155.116294 | -79.332911 | 355.758493 | 345.811429 | 88.288817 | -264.100535 | 505.926207 | 439.332662 | -351.916326 | -182.982404 | -244.997599 | -124.681894 | -64.757306 | -393.506053 | -7.960393 | -295.602495 | 81.768283 | 133.207832 | 179.198096 | 272.904681 | -364.562463 | -45.727567 | 42.628349 | -21.909418 | 134.047476 | -343.540370 | -314.292465 |
| 3 | -270.731462 | -137.761953 | -527.312647 | -218.644504 | 668.026240 | 244.705156 | 35.418742 | 72.747732 | -227.567966 | 590.507901 | 57.532947 | 108.541570 | -212.535653 | 325.987549 | 463.969679 | 22.380771 | -147.791566 | 314.094441 | 248.194372 | 153.268533 | -329.141577 | -22.089079 | -257.199419 | -122.974593 | 123.708121 | 65.945157 | 45.062804 | -30.353133 | -21.039216 | -75.827219 |
| 4 | 182.037623 | 946.997780 | 342.315656 | 827.243798 | -230.147281 | 383.199303 | -91.828932 | 103.421079 | -6.930702 | -149.524495 | 704.118567 | 264.463570 | 92.404438 | 251.224135 | 349.890751 | -19.404172 | 8.658991 | 98.809489 | 26.388725 | 124.174156 | -111.929443 | 115.415509 | -225.093791 | -102.336807 | 242.157418 | 12.097886 | 55.293899 | -109.351479 | -237.601500 | 48.974202 |
def checkPurity(y_rand, y_pred_rand):
    """Return the purity of a clustering, rounded to 3 decimals.

    Each cluster is credited with the count of its most frequent true label;
    purity is the sum of those counts divided by the number of samples.

    Args:
        y_rand: true class labels, one per sample (any hashable/sortable
            label type, e.g. ints or strings).
        y_pred_rand: cluster assignment of each sample, same length as
            `y_rand`.

    Returns:
        Purity in [0, 1], rounded to three decimal places.

    Raises:
        ValueError: if the inputs differ in length or are empty.
    """
    true_labels = np.asarray(y_rand).ravel()
    cluster_ids = np.asarray(y_pred_rand).ravel()
    if true_labels.shape[0] != cluster_ids.shape[0]:
        raise ValueError(
            "y_rand and y_pred_rand must have the same length, got %d and %d"
            % (true_labels.shape[0], cluster_ids.shape[0])
        )
    if true_labels.size == 0:
        raise ValueError("purity is undefined for empty input")

    # Encode labels and cluster ids independently as 0..k-1. Unlike a confusion
    # matrix over the union of both label sets, this works when the two use
    # different types (e.g. string classes vs integer clusters).
    _, true_idx = np.unique(true_labels, return_inverse=True)
    _, cluster_idx = np.unique(cluster_ids, return_inverse=True)

    # Contingency table: rows are clusters, columns are true labels.
    contingency = np.zeros((cluster_idx.max() + 1, true_idx.max() + 1), dtype=np.int64)
    np.add.at(contingency, (cluster_idx, true_idx), 1)

    purity = contingency.max(axis=1).sum() / true_labels.size
    return round(purity, 3)
def runKmeanswithInitData(df_init):
    """Cluster DIGITS_PCA_30_dataset with K-Means started from explicit seeds.

    Args:
        df_init: array-like (e.g. DataFrame) with one row per initial
            centroid and the same 30 feature columns as the data.

    Returns:
        Tuple (y, y_pred, n_iter): the label column of the dataset, the
        predicted cluster of every row, and the number of iterations the
        fit ran.
    """
    # One cluster per seed row, so the count can never disagree with the seeds.
    # n_init=1 because restarts are meaningless when the start is fixed.
    ktest = KMeans(
        n_clusters=len(df_init),
        init=df_init,
        n_init=1,
        max_iter=200,
        tol=0.0001,
    )
    # Columns 0-29 are the features; column 30 holds the label.
    X, y = DIGITS_PCA_30_dataset.iloc[:, 0:30], DIGITS_PCA_30_dataset.iloc[:, 30]
    # fitting the model to X
    fitted = ktest.fit(X)
    # predicting cluster ids for the same rows
    y_pred = ktest.predict(X)
    return y, y_pred, fitted.n_iter_
def RunAll():
    """Run K-Means once per seeding strategy and tabulate iterations and purity.

    Strategies 0-9 use the seed sets `cluster_centers_class0` ..
    `cluster_centers_class9`; strategy 'All' uses
    `cluster_centers_allclasses`.

    Returns:
        DataFrame with one row per strategy and the columns
        'initialization_method_class', 'iterations' and 'purity_inits'.
    """
    print("Clusters with Inits for all different Initializations")
    # (strategy name, seed centroids) in the order the rows should appear.
    seed_sets = [
        (0, cluster_centers_class0),
        (1, cluster_centers_class1),
        (2, cluster_centers_class2),
        (3, cluster_centers_class3),
        (4, cluster_centers_class4),
        (5, cluster_centers_class5),
        (6, cluster_centers_class6),
        (7, cluster_centers_class7),
        (8, cluster_centers_class8),
        (9, cluster_centers_class9),
        ('All', cluster_centers_allclasses),
    ]

    clusts_inits = []
    iters_inits = []
    purity_inits = []
    for init_name, init_centers in seed_sets:
        y, y_pred, n_iter_ = runKmeanswithInitData(init_centers)
        clusts_inits.append(init_name)
        iters_inits.append(n_iter_)
        purity_inits.append(checkPurity(y, y_pred))

    data = pd.DataFrame()
    data['initialization_method_class'] = clusts_inits
    data['iterations'] = iters_inits
    data['purity_inits'] = purity_inits
    return data
# Run every seeding experiment and display the summary table
# (one row per initialization: 0-9, then 'All').
df = RunAll()
df
Clusters with Inits for all different Initializations
| initialization_method_class | iterations | purity_inits | |
|---|---|---|---|
| 0 | 0 | 44 | 0.589000 |
| 1 | 1 | 105 | 0.595000 |
| 2 | 2 | 173 | 0.595000 |
| 3 | 3 | 32 | 0.598000 |
| 4 | 4 | 105 | 0.579000 |
| 5 | 5 | 37 | 0.527000 |
| 6 | 6 | 59 | 0.595000 |
| 7 | 7 | 122 | 0.632000 |
| 8 | 8 | 70 | 0.589000 |
| 9 | 9 | 44 | 0.594000 |
| 10 | All | 73 | 0.632000 |
def runKmeansRandomInit(n):
    """Cluster DIGITS_PCA_30_dataset into `n` clusters with default K-Means seeding.

    Args:
        n: number of clusters to fit.

    Returns:
        Tuple (y, y_pred, n_iter): the label column of the dataset, the
        predicted cluster of every row, and the number of iterations the
        fit ran.
    """
    # Columns 0-29 are the features; column 30 holds the label.
    features = DIGITS_PCA_30_dataset.iloc[:, 0:30]
    labels = DIGITS_PCA_30_dataset.iloc[:, 30]
    # random_state=0 keeps the run reproducible
    model = KMeans(n_clusters=n, random_state=0)
    model.fit(features)
    assigned = model.predict(features)
    return labels, assigned, model.n_iter_
# Sweep the number of clusters (5, 10, ..., 25) and record the purity of each fit.
clusts, purity = [], []
for noofClusters in range(5, 30, 5):
    # Progress output: the word "Cluster" and the count on separate lines.
    print("Cluster", noofClusters, sep="\n")
    clusts.append(noofClusters)
    # The iteration count is returned but not used in this sweep.
    y_rand, y_pred_rand, iters = runKmeansRandomInit(noofClusters)
    pureVal = checkPurity(y_rand, y_pred_rand)
    purity.append(pureVal)
Cluster 5 Cluster 10 Cluster 15 Cluster 20 Cluster 25
# Purity as a function of the number of clusters, one point per fitted model.
data = pd.DataFrame({'clusters': clusts, 'purity': purity})
ax = sns.scatterplot(data=data, x="clusters", y="purity")
#We will use the data without the labels as input for GMM
# Columns 1:5 are the four measurement columns (SepalLengthCm, SepalWidthCm,
# PetalLengthCm, PetalWidthCm); Id (column 0) and Species (column 5) are left out.
iris_dataset.iloc[:, 1:5].head()
| SepalLengthCm | SepalWidthCm | PetalLengthCm | PetalWidthCm | |
|---|---|---|---|---|
| 0 | 5.100000 | 3.500000 | 1.400000 | 0.200000 |
| 1 | 4.900000 | 3.000000 | 1.400000 | 0.200000 |
| 2 | 4.700000 | 3.200000 | 1.300000 | 0.200000 |
| 3 | 4.600000 | 3.100000 | 1.500000 | 0.200000 |
| 4 | 5.000000 | 3.600000 | 1.400000 | 0.200000 |
#Function that fits a Gaussian mixture model to the iris measurements (explanatory comments are inside the function)
from sklearn.mixture import GaussianMixture
def runGMM(mixture):
    """Fit a full-covariance Gaussian mixture to the four iris measurement columns.

    Args:
        mixture: number of mixture components to fit.

    Returns:
        The fitted model's `lower_bound_`, i.e. the lower bound on the
        log-likelihood of the training data reached by EM.
    """
    # random_state=0 keeps the fit reproducible across runs
    gmm = GaussianMixture(n_components=mixture, random_state=0, covariance_type='full')
    # Columns 1:5 are the measurements; Id and Species are not used for fitting.
    gmm.fit(iris_dataset.iloc[:, 1:5])
    # The value is on a log scale; the caller exponentiates it before plotting.
    # NOTE(review): the earlier comment here said gmm.score(...) on the training
    # data returns the same value as lower_bound_ -- confirm against the
    # scikit-learn docs before relying on the two being interchangeable.
    return gmm.lower_bound_
# Fit a mixture for every odd component count 1, 3, ..., 31 and record
# exp(lower bound), rounded to two decimals, as the value to plot.
mixtures = list(range(1, 33, 2))
likelihood = []
for mix in mixtures:
    likeliVal = round(np.exp(runGMM(mix)), 2)
    likelihood.append(likeliVal)
# Plot the recorded value against the number of mixture components.
data = pd.DataFrame({'mixtures': mixtures, 'likelihood': likelihood})
ax = sns.scatterplot(data=data, x="mixtures", y="likelihood")
# Put a tick on every odd value from 1 to 33 so each fitted count has its own tick.
ax.set(xticks=np.arange(1, 35, 2))
[[<matplotlib.axis.XTick at 0x1a2ae9e550>, <matplotlib.axis.XTick at 0x1a1c8b7a58>, <matplotlib.axis.XTick at 0x1a1d591668>, <matplotlib.axis.XTick at 0x1a29f93dd8>, <matplotlib.axis.XTick at 0x1a29f93fd0>, <matplotlib.axis.XTick at 0x1a29f93b38>, <matplotlib.axis.XTick at 0x1a2ae85710>, <matplotlib.axis.XTick at 0x1a2ae85f28>, <matplotlib.axis.XTick at 0x1a2ae85470>, <matplotlib.axis.XTick at 0x1a1f5afb70>, <matplotlib.axis.XTick at 0x1a1d886da0>, <matplotlib.axis.XTick at 0x1a2ae85cc0>, <matplotlib.axis.XTick at 0x1a1d88d208>, <matplotlib.axis.XTick at 0x1a218d9438>, <matplotlib.axis.XTick at 0x1a218d9080>, <matplotlib.axis.XTick at 0x1a218d9cf8>, <matplotlib.axis.XTick at 0x1a1cd10080>]]